cavis/datavec/datavec-excel/src/main/java/org/datavec/poi/excel/ExcelRecordReader.java

/*
 *  ******************************************************************************
 *  *
 *  *
 *  * This program and the accompanying materials are made available under the
 *  * terms of the Apache License, Version 2.0 which is available at
 *  * https://www.apache.org/licenses/LICENSE-2.0.
 *  *
 *  *  See the NOTICE file distributed with this work for additional
 *  *  information regarding copyright ownership.
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  * License for the specific language governing permissions and limitations
 *  * under the License.
 *  *
 *  * SPDX-License-Identifier: Apache-2.0
 *  *****************************************************************************
 */

package org.datavec.poi.excel;

import org.apache.poi.ss.usermodel.*;
import org.datavec.api.conf.Configuration;
import org.datavec.api.records.Record;
import org.datavec.api.records.metadata.RecordMetaDataIndex;
import org.datavec.api.records.reader.impl.FileRecordReader;
import org.datavec.api.split.InputSplit;
import org.datavec.api.writable.BooleanWritable;
import org.datavec.api.writable.DoubleWritable;
import org.datavec.api.writable.Text;
import org.datavec.api.writable.Writable;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

public class ExcelRecordReader extends FileRecordReader {
    //originally from CSVRecordReader
    private boolean skippedLines = false;
    protected int skipNumLines = 0;
    public final static String SKIP_NUM_LINES = NAME_SPACE + ".skipnumlines";

    private Iterator<Sheet> sheetIterator;
    private Iterator<Row> rows;
    // Create a DataFormatter to format and get each cell's value as String
    private DataFormatter dataFormatter = new DataFormatter();
    private Workbook currWorkBook;
    //we should ensure that the number of columns is consistent across all worksheets
    private int numColumns = -1;

    /**
     * Skip skipNumLines number of lines
     * @param skipNumLines the number of lines to skip
     */
    public ExcelRecordReader(int skipNumLines) {
        this.skipNumLines = skipNumLines;
    }


    public ExcelRecordReader() {
        this(0);
    }

    @Override
    public boolean hasNext() {
        if (!skipLines())
            throw new NoSuchElementException("No next element found!");
        return skipLines() && super.hasNext() ||
                sheetIterator != null && sheetIterator.hasNext()
                || rows != null && rows.hasNext();
    }


    private boolean skipLines() {
        if (!skippedLines && skipNumLines > 0) {
            for (int i = 0; i < skipNumLines; i++) {
                if (!super.hasNext()) {
                    return false;
                }
                super.next();
            }
            skippedLines = true;
        }
        return true;
    }

    @Override
    public List<Writable> next() {
        return nextRecord().getRecord();
    }

    @Override
    public Record nextRecord(){
        //start at top tracking rows
        if(rows != null && rows.hasNext()) {
            Row currRow = rows.next();
            List<Writable> ret = new ArrayList<>(currRow.getLastCellNum());
            for(Cell cell: currRow) {
                String cellValue = dataFormatter.formatCellValue(cell);
                ret.add(new Text(cellValue));
            }
            Record record = new org.datavec.api.records.impl.Record(ret,
                                    new RecordMetaDataIndex(
                                            currRow.getRowNum(),
                                            super.currentUri,
                                            ExcelRecordReader.class));
            return record;
        }
        // next track sheets
        else if(sheetIterator != null && sheetIterator.hasNext()) {
            Sheet sheet = sheetIterator.next();
            rows = sheet.rowIterator();
            Row currRow = rows.next();
            Record record = new org.datavec.api.records.impl.Record(rowToRecord(currRow),
                                new RecordMetaDataIndex(
                                    currRow.getRowNum(),
                                    super.currentUri,
                                    ExcelRecordReader.class));
            return record;

        }


        //finally extract workbooks from files and iterate over those starting again at top
        try(InputStream is = streamCreatorFn.apply(super.locationsIterator.next())) {
            // Creating a Workbook from an Excel file (.xls or .xlsx)
            try {
                if (currWorkBook != null) {
                    currWorkBook.close();
                }

                this.currWorkBook = WorkbookFactory.create(is);
                this.sheetIterator = currWorkBook.sheetIterator();
                Sheet sheet = sheetIterator.next();
                rows = sheet.rowIterator();
                Row currRow = rows.next();
                Record record = new org.datavec.api.records.impl.Record(rowToRecord(currRow),
                        new RecordMetaDataIndex(
                                currRow.getRowNum(),
                                super.currentUri,
                                ExcelRecordReader.class));
                return record;

            } catch (Exception e) {
                throw new IllegalStateException("Error processing row", e);
            }
        } catch (IOException e){
            throw new RuntimeException("Error reading from stream", e);
        }

    }

    @Override
    public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
        super.initialize(conf, split);
        this.skipNumLines = conf.getInt(SKIP_NUM_LINES,0);
    }

    @Override
    public void reset() {
        super.reset();
        skippedLines = false;
    }


    private List<Writable> rowToRecord(Row currRow) {
        if(numColumns < 0) {
            numColumns = currRow.getLastCellNum();
        }

        if(currRow.getLastCellNum() != numColumns) {
            throw new IllegalStateException("Invalid number of columns for row. First number of columns found was " + numColumns + " but row " + currRow.getRowNum() + " was " + currRow.getLastCellNum());
        }

        List<Writable> ret = new ArrayList<>(currRow.getLastCellNum());
        for(Cell cell: currRow) {
            String cellValue = dataFormatter.formatCellValue(cell);
            switch(cell.getCellTypeEnum()) {
                case BLANK: ret.add(new Text("")); break;
                case STRING: ret.add(new Text("")); break;
                case BOOLEAN: ret.add(new BooleanWritable(Boolean.valueOf(cellValue))); break;
                case NUMERIC: ret.add(new DoubleWritable(Double.parseDouble(cellValue))); break;
                default: ret.add(new Text(cellValue));
            }
        }

        return ret;

    }


}