/*
 *  ******************************************************************************
 *  *
 *  *
 *  * This program and the accompanying materials are made available under the
 *  * terms of the Apache License, Version 2.0 which is available at
 *  * https://www.apache.org/licenses/LICENSE-2.0.
 *  *
 *  * See the NOTICE file distributed with this work for additional
 *  * information regarding copyright ownership.
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  * License for the specific language governing permissions and limitations
 *  * under the License.
 *  *
 *  * SPDX-License-Identifier: Apache-2.0
 *  *****************************************************************************
 */
|
|
|
|
package org.deeplearning4j.text.sentenceiterator;
|
|
|
|
import lombok.NonNull;
|
|
|
|
import org.nd4j.common.util.ThreadUtils;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.util.concurrent.ArrayBlockingQueue;
|
|
import java.util.concurrent.TimeUnit;
|
|
import java.util.concurrent.atomic.AtomicBoolean;
|
|
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
|
|
|
@Deprecated
|
|
public class PrefetchingSentenceIterator implements SentenceIterator {
|
|
|
|
private SentenceIterator sourceIterator;
|
|
private int fetchSize;
|
|
private AsyncIteratorReader reader;
|
|
private SentencePreProcessor preProcessor;
|
|
|
|
protected static final Logger log = LoggerFactory.getLogger(PrefetchingSentenceIterator.class);
|
|
|
|
private PrefetchingSentenceIterator() {
|
|
|
|
}
|
|
|
|
/**
|
|
* Here we start async readers
|
|
*/
|
|
private void init() {
|
|
reader = new AsyncIteratorReader(sourceIterator, fetchSize, this.preProcessor);
|
|
reader.start();
|
|
}
|
|
|
|
@Override
|
|
public String nextSentence() {
|
|
return reader.nextLine();
|
|
}
|
|
|
|
@Override
|
|
public boolean hasNext() {
|
|
return (reader != null) ? reader.hasMoreLines() : false;
|
|
}
|
|
|
|
@Override
|
|
public void reset() {
|
|
if (reader != null)
|
|
reader.reset();
|
|
}
|
|
|
|
@Override
|
|
public void finish() {
|
|
if (reader != null)
|
|
reader.terminate();
|
|
}
|
|
|
|
@Override
|
|
public SentencePreProcessor getPreProcessor() {
|
|
return preProcessor;
|
|
}
|
|
|
|
@Override
|
|
public void setPreProcessor(SentencePreProcessor preProcessor) {
|
|
this.preProcessor = preProcessor;
|
|
}
|
|
|
|
@Override
|
|
protected void finalize() throws Throwable {
|
|
if (reader != null)
|
|
reader.terminate();
|
|
super.finalize();
|
|
}
|
|
|
|
public static class Builder {
|
|
private SentenceIterator iterator;
|
|
private int fetchSize = 10000;
|
|
private SentencePreProcessor preProcessor;
|
|
|
|
public Builder(@NonNull SentenceIterator iterator) {
|
|
this.iterator = iterator;
|
|
}
|
|
|
|
public Builder setFetchSize(int fetchSize) {
|
|
this.fetchSize = fetchSize;
|
|
return this;
|
|
}
|
|
|
|
public Builder setSentencePreProcessor(@NonNull SentencePreProcessor preProcessor) {
|
|
this.preProcessor = preProcessor;
|
|
return this;
|
|
}
|
|
|
|
public PrefetchingSentenceIterator build() {
|
|
PrefetchingSentenceIterator pre = new PrefetchingSentenceIterator();
|
|
pre.sourceIterator = this.iterator;
|
|
pre.fetchSize = this.fetchSize;
|
|
pre.preProcessor = this.preProcessor;
|
|
|
|
pre.init();
|
|
return pre;
|
|
}
|
|
}
|
|
|
|
private class AsyncIteratorReader extends Thread implements Runnable {
|
|
private SentenceIterator iterator;
|
|
private int fetchSize;
|
|
private AtomicBoolean shouldTerminate = new AtomicBoolean(false);
|
|
private ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
|
|
private SentencePreProcessor preProcessor;
|
|
private AtomicBoolean isRunning = new AtomicBoolean(true);
|
|
private ArrayBlockingQueue<String> buffer;
|
|
|
|
public AsyncIteratorReader(@NonNull SentenceIterator iterator, int fetchSize,
|
|
SentencePreProcessor preProcessor) {
|
|
this.iterator = iterator;
|
|
this.fetchSize = fetchSize;
|
|
this.preProcessor = preProcessor;
|
|
|
|
buffer = new ArrayBlockingQueue<>(fetchSize * 3);
|
|
this.setName("AsyncIteratorReader thread");
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
while (!shouldTerminate.get()) {
|
|
if (iterator.hasNext())
|
|
isRunning.set(true);
|
|
else
|
|
ThreadUtils.uncheckedSleep(50);
|
|
while (!shouldTerminate.get() && iterator.hasNext()) {
|
|
|
|
int cnt = 0;
|
|
if (buffer.size() < fetchSize) {
|
|
while (!shouldTerminate.get() && cnt < fetchSize && iterator.hasNext()) {
|
|
try {
|
|
lock.writeLock().lock();
|
|
String line = iterator.nextSentence();
|
|
if (line != null)
|
|
buffer.add((this.preProcessor == null) ? line : this.preProcessor.preProcess(line));
|
|
} finally {
|
|
lock.writeLock().unlock();
|
|
}
|
|
cnt++;
|
|
}
|
|
// log.info("Lines added: [" + cnt + "], buffer size: [" + buffer.size() + "]");
|
|
} else
|
|
ThreadUtils.uncheckedSleep(10);
|
|
}
|
|
isRunning.set(false);
|
|
}
|
|
}
|
|
|
|
public String nextLine() {
|
|
if (!buffer.isEmpty())
|
|
return buffer.poll();
|
|
|
|
try {
|
|
return buffer.poll(2L, TimeUnit.SECONDS);
|
|
} catch (Exception e) {
|
|
return null;
|
|
}
|
|
}
|
|
|
|
public boolean hasMoreLines() {
|
|
if (!buffer.isEmpty())
|
|
return true;
|
|
|
|
try {
|
|
this.lock.readLock().lock();
|
|
return iterator.hasNext() || !buffer.isEmpty();
|
|
} finally {
|
|
this.lock.readLock().unlock();
|
|
}
|
|
}
|
|
|
|
public void reset() {
|
|
try {
|
|
lock.writeLock().lock();
|
|
buffer.clear();
|
|
iterator.reset();
|
|
} catch (Exception e) {
|
|
throw new RuntimeException(e);
|
|
} finally {
|
|
lock.writeLock().unlock();
|
|
}
|
|
}
|
|
|
|
public void terminate() {
|
|
shouldTerminate.set(true);
|
|
}
|
|
}
|
|
}
|