2021-02-01 14:31:20 +09:00
|
|
|
/*
|
|
|
|
* ******************************************************************************
|
|
|
|
* *
|
|
|
|
* *
|
|
|
|
* * This program and the accompanying materials are made available under the
|
|
|
|
* * terms of the Apache License, Version 2.0 which is available at
|
|
|
|
* * https://www.apache.org/licenses/LICENSE-2.0.
|
|
|
|
* *
|
2021-02-01 17:47:29 +09:00
|
|
|
* * See the NOTICE file distributed with this work for additional
|
|
|
|
* * information regarding copyright ownership.
|
2021-02-01 14:31:20 +09:00
|
|
|
* * Unless required by applicable law or agreed to in writing, software
|
|
|
|
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
|
|
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
|
|
* * License for the specific language governing permissions and limitations
|
|
|
|
* * under the License.
|
|
|
|
* *
|
|
|
|
* * SPDX-License-Identifier: Apache-2.0
|
|
|
|
* *****************************************************************************
|
|
|
|
*/
|
2019-06-06 15:21:15 +03:00
|
|
|
|
|
|
|
package org.deeplearning4j.text.documentiterator;
|
|
|
|
|
2019-06-13 20:40:40 +10:00
|
|
|
import org.deeplearning4j.BaseDL4JTest;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
|
|
|
|
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
|
|
|
|
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
|
2021-03-16 11:57:24 +09:00
|
|
|
import org.junit.jupiter.api.Test;
|
2020-04-29 11:19:26 +10:00
|
|
|
import org.nd4j.common.io.ClassPathResource;
|
2019-06-06 15:21:15 +03:00
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
import java.io.InputStream;
|
|
|
|
|
2021-03-16 11:57:24 +09:00
|
|
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
2022-09-20 15:40:53 +02:00
|
|
|
|
2019-06-13 20:40:40 +10:00
|
|
|
public class DefaultDocumentIteratorTest extends BaseDL4JTest {
|
2019-06-06 15:21:15 +03:00
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testDocumentIterator() throws Exception {
|
|
|
|
ClassPathResource reuters5250 = new ClassPathResource("/reuters/5250");
|
|
|
|
File f = reuters5250.getFile();
|
|
|
|
|
|
|
|
DocumentIterator iter = new FileDocumentIterator(f.getAbsolutePath());
|
|
|
|
|
|
|
|
InputStream doc = iter.nextDocument();
|
|
|
|
|
|
|
|
TokenizerFactory t = new DefaultTokenizerFactory();
|
|
|
|
Tokenizer next = t.create(doc);
|
|
|
|
String[] list = "PEARSON CONCENTRATES ON FOUR SECTORS".split(" ");
|
|
|
|
///PEARSON CONCENTRATES ON FOUR SECTORS
|
|
|
|
int count = 0;
|
|
|
|
while (next.hasMoreTokens() && count < list.length) {
|
|
|
|
String token = next.nextToken();
|
|
|
|
assertEquals(list[count++], token);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
doc.close();
|
|
|
|
}
|
|
|
|
}
|