Copyright updates, removal of extra nlp modules

master
agibsonccc 2021-02-18 11:46:53 +09:00
parent 8bc3172e40
commit 1eaee7f6d9
277 changed files with 4347 additions and 620889 deletions

35
Jenkinsfile vendored
View File

@ -1,20 +1,21 @@
/*
* ******************************************************************************
* * Copyright (c) 2021 Deeplearning4j Contributors
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
/* ******************************************************************************
*
*
* This program and the accompanying materials are made available under the
* terms of the Apache License, Version 2.0 which is available at
* https://www.apache.org/licenses/LICENSE-2.0.
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: Apache-2.0
******************************************************************************/
#!groovy

File diff suppressed because it is too large Load Diff

View File

@ -163,6 +163,12 @@
<artifactId>oshi-core</artifactId>
<version>${oshi.version}</version>
</dependency>
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-native</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<profiles>

View File

@ -64,7 +64,7 @@ public class CNN1DGradientCheckTest extends BaseDL4JTest {
@Override
public long getTimeoutMilliseconds() {
return 90000L;
return 180000;
}
@Test

View File

@ -1,70 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ /* ******************************************************************************
~ *
~ *
~ * This program and the accompanying materials are made available under the
~ * terms of the Apache License, Version 2.0 which is available at
~ * https://www.apache.org/licenses/LICENSE-2.0.
~ *
~ * See the NOTICE file distributed with this work for additional
~ * information regarding copyright ownership.
~ * Unless required by applicable law or agreed to in writing, software
~ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
~ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
~ * License for the specific language governing permissions and limitations
~ * under the License.
~ *
~ * SPDX-License-Identifier: Apache-2.0
~ ******************************************************************************/
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-nlp-parent</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-nlp-chinese</artifactId>
<version>1.0.0-SNAPSHOT</version>
<properties>
<slf4j-api.version>1.6.4</slf4j-api.version>
<logback-classic.version>0.9.28</logback-classic.version>
<nlp-lang.version>1.7.2</nlp-lang.version>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-nlp</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.nlpcn</groupId>
<artifactId>nlp-lang</artifactId>
<version>${nlp-lang.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
<profiles>
<profile>
<id>test-nd4j-native</id>
</profile>
<profile>
<id>test-nd4j-cuda-11.0</id>
</profile>
</profiles>
</project>

View File

@ -1,251 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.crf;
import org.ansj.app.crf.pojo.Element;
import org.nlpcn.commons.lang.util.WordAlert;
import java.util.ArrayList;
import java.util.List;
/**
 * Feature configuration for the CRF word segmenter: the feature template, the
 * marker characters used for special symbols, and helpers that turn raw text
 * into {@link Element} lists and per-position feature arrays.
 */
public class Config {

    /** Regular expression used to split a line into words (runs of whitespace). */
    public String splitStr = "\\s+";

    /**
     * Creates a configuration that uses the given feature template.
     *
     * @param template feature template; every row lists offsets that are added to
     *                 the current index when features are built
     */
    public Config(int[][] template) {
        this.template = template;
    }

    /** Number of tag types; hard-coded to 4. */
    public static final int TAG_NUM = 4;

    // Marker characters for special symbols.
    public static final char BEGIN = 128;
    public static final char END = 129;
    public static final char NUM_BEGIN = 130;
    public static final char EN_BEGIN = 140;
    public static final char FEATURE_BEGIN = 150;

    // Run kinds tracked while scanning text in wordAlert.
    private static final int RUN_NONE = 0;
    private static final int RUN_NUM = 1;
    private static final int RUN_EN = 2;

    /**
     * Maps a digit run to its marker character.
     *
     * @param str the digit run
     * @return {@code NUM_BEGIN} when the run is longer than 9 characters,
     *         otherwise {@code NUM_BEGIN + str.length()}
     */
    public static char getNum(String str) {
        if (str.length() > 9) {
            return NUM_BEGIN;
        } else {
            return (char) (NUM_BEGIN + str.length());
        }
    }

    /**
     * Maps a letter run to its marker character.
     *
     * @param str the letter run
     * @return {@code EN_BEGIN} when the run is longer than 9 characters,
     *         otherwise {@code EN_BEGIN + str.length()}
     */
    public static char getEn(String str) {
        if (str.length() > 9) {
            return EN_BEGIN;
        } else {
            return (char) (EN_BEGIN + str.length());
        }
    }

    // Character tag types (see getTagName for the letter each value maps to).
    public static int S = 0;
    public static int B = 1;
    public static int M = 2;
    public static int E = 3;

    // Default template; always replaced by the constructor argument.
    private int[][] template = {{-2}, {-1}, {0}, {1}, {2}, {-2, -1}, {-1, 0}, {0, 1}, {1, 2}, {-1, 1}};

    public int[][] getTemplate() {
        return template;
    }

    public void setTemplate(int[][] template) {
        this.template = template;
    }

    /**
     * Converts text into a list of elements. The text is first passed through
     * {@code WordAlert.alertStr}; then every run of characters in {@code '0'..'9'}
     * becomes one element, every run of characters in {@code 'A'..'z'} becomes one
     * element, and every other character becomes its own element.
     *
     * <p>A run is always classified by its own kind: a letter run that is ended by
     * a digit is emitted through {@link #getEn(String)}, a digit run that is ended
     * by a letter through {@link #getNum(String)}.
     *
     * @param word the text to convert
     * @return the elements in text order
     */
    public static List<Element> wordAlert(String word) {
        char[] chars = WordAlert.alertStr(word);
        List<Element> list = new ArrayList<>();
        StringBuilder run = new StringBuilder();
        int status = RUN_NONE;
        for (char c : chars) {
            int kind;
            if (c >= '0' && c <= '9') {
                kind = RUN_NUM;
            } else if (c >= 'A' && c <= 'z') {
                // NOTE(review): this range also covers the punctuation between 'Z' and 'a'
                // ([ \ ] ^ _ `); kept as is because existing models may rely on it.
                kind = RUN_EN;
            } else {
                kind = RUN_NONE;
            }
            // The pending run ends as soon as the character kind changes.
            if (status != RUN_NONE && kind != status) {
                list.add(toRunElement(run, status));
                run = new StringBuilder();
            }
            if (kind == RUN_NONE) {
                list.add(new Element(c));
            } else {
                run.append(c);
            }
            status = kind;
        }
        if (status != RUN_NONE) {
            list.add(toRunElement(run, status));
        }
        return list;
    }

    /** Builds the single element that stands for a finished digit or letter run. */
    private static Element toRunElement(CharSequence run, int status) {
        String text = run.toString();
        Element element = new Element(status == RUN_NUM ? getNum(text) : getEn(text));
        element.len = text.length();
        return element;
    }

    /**
     * Splits a line into words and converts every word into tagged elements: a
     * one-element word is tagged {@code S}; longer words are tagged {@code B},
     * then {@code M} for every inner element, then {@code E}.
     *
     * @param temp     the line to convert
     * @param splitStr regular expression that separates the words
     * @return the tagged elements of all words, in order
     */
    public static List<Element> makeToElementList(String temp, String splitStr) {
        String[] split = temp.split(splitStr);
        List<Element> list = new ArrayList<>(temp.length());
        for (String word : split) {
            List<Element> wordAlert = wordAlert(word);
            int len = wordAlert.size();
            if (len == 1) {
                wordAlert.get(0).updateTag(Config.S);
            } else if (len >= 2) {
                wordAlert.get(0).updateTag(Config.B);
                for (int i = 1; i < len - 1; i++) {
                    wordAlert.get(i).updateTag(Config.M);
                }
                wordAlert.get(len - 1).updateTag(Config.E);
            }
            list.addAll(wordAlert);
        }
        return list;
    }

    /**
     * Converts text into untagged elements; same as {@link #wordAlert(String)}.
     *
     * @param temp the text to convert
     * @return the elements in text order
     */
    public List<Element> makeToElementList(String temp) {
        return wordAlert(temp);
    }

    /**
     * Returns the name of the element at {@code index}, or a boundary marker when
     * the index lies outside the list.
     *
     * @param list  the elements
     * @param index position to look up; may be out of range
     * @return {@code BEGIN} for a negative index, {@code END} for an index past the
     *         end, otherwise the element's name
     */
    public char getNameIfOutArr(List<Element> list, int index) {
        if (index < 0) {
            return Config.BEGIN;
        } else if (index >= list.size()) {
            return Config.END;
        } else {
            return list.get(index).name;
        }
    }

    /**
     * Returns the tag of the element at {@code index} as a char, or 0 when the
     * index lies outside the list.
     *
     * @param list  the elements
     * @param index position to look up; may be out of range
     * @return the tag cast to char, or 0 when out of range
     */
    public char getTagIfOutArr(List<Element> list, int index) {
        if (index < 0 || index >= list.size()) {
            return 0;
        } else {
            return (char) list.get(index).getTag();
        }
    }

    /**
     * Builds the feature arrays for one position. For template row {@code i} the
     * array holds the names at {@code index + offset} for every offset in the row,
     * followed by the marker {@code FEATURE_BEGIN + i}.
     *
     * @param list  the elements
     * @param index the position features are built for
     * @return one array per template row; the entry stays {@code null} for an
     *         empty template row
     */
    public char[][] makeFeatureArr(List<Element> list, int index) {
        char[][] result = new char[template.length][];
        for (int i = 0; i < template.length; i++) {
            if (template[i].length == 0) {
                continue;
            }
            int len = template[i].length;
            char[] chars = new char[len + 1];
            for (int j = 0; j < len; j++) {
                chars[j] = getNameIfOutArr(list, index + template[i][j]);
            }
            // The last slot identifies which template row produced the feature.
            chars[len] = (char) (FEATURE_BEGIN + i);
            result[i] = chars;
        }
        return result;
    }

    /**
     * Maps a numeric tag to its letter.
     *
     * @param tag the numeric tag
     * @return {@code 'S'}, {@code 'B'}, {@code 'M'} or {@code 'E'} for 0..3,
     *         {@code '?'} for any other value
     */
    public static char getTagName(int tag) {
        switch (tag) {
            case 0:
                return 'S';
            case 1:
                return 'B';
            case 2:
                return 'M';
            case 3:
                return 'E';
            default:
                return '?';
        }
    }
}

View File

@ -1,81 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.crf;
import org.ansj.app.crf.pojo.Element;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
/**
 * Command-line tool that converts a segmented corpus into a training file with
 * one "name tag" pair per line and an empty line in front of every sentence.
 */
public class MakeTrainFile {

    private static final Log logger = LogFactory.getLog();

    /**
     * Reads the corpus and writes the training file.
     *
     * @param args optional {@code [inputPath, outputPath]}; when exactly two
     *             arguments are not given, {@code corpus.txt} and {@code train.txt}
     *             are used
     */
    public static void main(String[] args) {
        boolean pathsGiven = args != null && args.length == 2;
        String inputPath = pathsGiven ? args[0] : "corpus.txt";
        String outputPath = pathsGiven ? args[1] : "train.txt";

        if (StringUtil.isBlank(inputPath) || StringUtil.isBlank(outputPath)) {
            logger.info("org.ansj.app.crf.MakeTrainFile [inputPath] [outputPath]");
            return;
        }

        try (BufferedReader reader = IOUtil.getReader(inputPath, "utf-8");
                FileOutputStream fos = new FileOutputStream(outputPath)) {
            int written = 0;
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (StringUtil.isBlank(line)) {
                    continue;
                }
                // Only the first non-blank line is trimmed before conversion.
                if (written == 0) {
                    line = StringUtil.trim(line);
                }
                StringBuilder block = new StringBuilder("\n");
                for (Element element : Config.makeToElementList(line, "\\s+")) {
                    block.append(element.nameStr())
                            .append(" ")
                            .append(Config.getTagName(element.getTag()))
                            .append("\n");
                }
                fos.write(block.toString().getBytes(IOUtil.UTF8));
                written++;
                // Progress: number of sentences written so far.
                System.out.println(written);
            }
        } catch (FileNotFoundException e) {
            logger.warn("文件没有找到", e);
        } catch (IOException e) {
            logger.warn("IO异常", e);
        }
    }
}

View File

@ -1,196 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.crf;
import org.ansj.app.crf.model.CRFModel;
import org.ansj.app.crf.model.CRFppTxtModel;
import org.ansj.app.crf.model.WapitiCRFModel;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.MapCount;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.*;
import java.util.Map;
import java.util.Map.Entry;
import java.util.zip.GZIPOutputStream;
/**
 * Base class of the CRF models: holds the feature template configuration, the
 * feature weights and the tag transition matrix, and knows how to detect, load
 * and write model files.
 */
public abstract class Model {

    public static final Log logger = LogFactory.getLog(Model.class);

    /** Feature template configuration of the loaded model. */
    protected Config config;

    /** Feature weights, keyed by feature string. */
    protected SmartForest<float[]> featureTree = null;

    /** Tag-to-tag transition weights, indexed {@code [from][to]}. */
    protected float[][] status = new float[Config.TAG_NUM][Config.TAG_NUM];

    public int allFeatureCount = 0;

    /**
     * Checks whether the file at the given path is in the format this model
     * implementation understands.
     *
     * @param modelPath path of the model file
     * @return {@code true} when this implementation can load the file
     * @throws IOException if an implementation fails to read the file
     */
    public abstract boolean checkModel(String modelPath) throws IOException;

    /**
     * Loads a model file, trying {@link CRFModel}, {@link CRFppTxtModel} and
     * {@link WapitiCRFModel} in that order and using the first whose
     * {@link #checkModel(String)} accepts the file.
     *
     * @param modelPath path of the model file
     * @return the loaded model
     * @throws Exception if no implementation accepts the file or loading fails
     */
    public static Model load(String modelPath) throws Exception {
        Model model = new CRFModel();
        if (model.checkModel(modelPath)) {
            return model.loadModel(modelPath);
        }
        model = new CRFppTxtModel();
        if (model.checkModel(modelPath)) {
            return model.loadModel(modelPath);
        }
        model = new WapitiCRFModel();
        if (model.checkModel(modelPath)) {
            return model.loadModel(modelPath);
        }
        throw new Exception("I did not know what type of model by file " + modelPath);
    }

    /**
     * Loads a model of a known type from a stream. The class is instantiated
     * through its declared no-argument constructor.
     *
     * @param modelClass  concrete model class to instantiate
     * @param inputStream stream that holds the model data
     * @return the loaded model
     * @throws Exception if instantiation or loading fails
     */
    public static Model load(Class<? extends Model> modelClass, InputStream inputStream) throws Exception {
        return modelClass
                .getDeclaredConstructor()
                .newInstance()
                .loadModel(inputStream);
    }

    /**
     * Loads this model from a file.
     *
     * @param modelPath path of the model file
     * @return the loaded model
     * @throws Exception if loading fails
     */
    public abstract Model loadModel(String modelPath) throws Exception;

    /**
     * Loads this model from a stream.
     *
     * @param is stream that holds the model data
     * @return the loaded model
     * @throws Exception if loading fails
     */
    public abstract Model loadModel(InputStream is) throws Exception;

    /**
     * Looks up the weights stored for a feature.
     *
     * @param chars the feature characters; may be {@code null}
     * @return the weights, or {@code null} when {@code chars} is {@code null} or no
     *         weights are stored for the feature
     */
    public float[] getFeature(char... chars) {
        if (chars == null) {
            return null;
        }
        SmartForest<float[]> branch = featureTree.getBranch(chars);
        return branch == null ? null : branch.getParam();
    }

    public Config getConfig() {
        return this.config;
    }

    /**
     * Returns the transition weight between two tags.
     *
     * @param s1 the tag moved from
     * @param s2 the tag moved to
     * @return the stored transition weight
     */
    public float tagRate(int s1, int s2) {
        return status[s1][s2];
    }

    /**
     * Prints the non-zero weights of one feature to standard output.
     *
     * @param cs    feature string whose last character is the template marker
     * @param tempW weights of the feature
     */
    protected static void printFeatureTree(String cs, float[] tempW) {
        String name = "*";
        if (tempW.length == 4) {
            name = "U";
        }
        name += "*" + (cs.charAt(cs.length() - 1) - Config.FEATURE_BEGIN + 1) + ":" + cs.substring(0, cs.length() - 1);
        for (int i = 0; i < tempW.length; i++) {
            if (tempW[i] != 0) {
                // For i < 4 the first tag index is -1, which getTagName renders as '?'.
                System.out.println(name + "\t" + Config.getTagName(i / 4 - 1) + "\t" + Config.getTagName(i % 4) + "\t"
                        + tempW[i]);
            }
        }
    }

    /**
     * Writes this model to a gzip-compressed file. Layout: version string,
     * transition matrix, template, then one group per weight-array width (width,
     * entry count, entries) and a closing {@code 0, 0} pair.
     *
     * <p>I/O failures are logged, not thrown.
     *
     * @param path path of the file to write
     */
    public void writeModel(String path) {
        // Every stream is a resource so the gzip stream is finished (buffered data
        // and trailer written) before the file is closed; flushing alone would
        // leave a truncated file behind.
        try (FileOutputStream fso = new FileOutputStream(path);
                GZIPOutputStream gzip = new GZIPOutputStream(fso);
                ObjectOutputStream oos = new ObjectOutputStream(gzip)) {
            oos.writeUTF(CRFModel.VERSION);
            oos.writeObject(status);
            oos.writeObject(config.getTemplate());
            Map<String, float[]> map = featureTree.toMap();
            // Count how many features exist for every weight-array width.
            MapCount<Integer> mc = new MapCount<>();
            for (float[] v : map.values()) {
                mc.add(v.length);
            }
            for (Entry<Integer, Double> entry : mc.get().entrySet()) {
                int win = entry.getKey();
                oos.writeInt(win); // width
                oos.writeInt(entry.getValue().intValue()); // number of entries
                for (Entry<String, float[]> e : map.entrySet()) {
                    if (e.getValue().length == win) {
                        oos.writeUTF(e.getKey());
                        float[] value = e.getValue();
                        for (int i = 0; i < win; i++) {
                            oos.writeFloat(value[i]);
                        }
                    }
                }
            }
            // Terminator group.
            oos.writeInt(0);
            oos.writeInt(0);
            oos.flush();
        } catch (FileNotFoundException e) {
            logger.warn("文件没有找到", e);
        } catch (IOException e) {
            logger.warn("IO异常", e);
        }
    }
}

View File

@ -1,192 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.crf;
import org.ansj.app.crf.pojo.Element;
import org.ansj.util.MatrixUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
*
*
* @author ansj
*
*/
/**
 * Segments text into words with a CRF model: every element of the text gets a
 * tag (0 = S, 1 = B, 2 = M, 3 = E) and the tags are turned into substrings.
 */
public class SplitWord {

    private Model model = null;

    /**
     * @param model the CRF model that supplies feature and transition weights
     */
    public SplitWord(Model model) {
        this.model = model;
    }

    /**
     * Segments the given characters.
     *
     * @param chars the text to segment
     * @return the words in text order
     */
    public List<String> cut(char[] chars) {
        return cut(new String(chars));
    }

    /**
     * Segments a line of text.
     *
     * <p>An element tagged 0 becomes a word of its own. An element tagged 1 starts
     * a word that runs up to and including the next element tagged 3, or to the
     * end of the list when no such element follows. Every element is counted
     * exactly once when the word boundary is advanced.
     *
     * @param line the text to segment
     * @return the words in text order; an empty list for blank input
     */
    public List<String> cut(String line) {
        if (StringUtil.isBlank(line)) {
            return Collections.emptyList();
        }
        List<Element> elements = vterbi(line);
        List<String> result = new ArrayList<>();
        int begin = 0;
        int end = 0;
        int last = elements.size() - 1;
        for (int i = 0; i < elements.size(); i++) {
            Element e = elements.get(i);
            switch (e.getTag()) {
                case 0:
                    end += e.len;
                    result.add(line.substring(begin, end));
                    begin = end;
                    break;
                case 1:
                    end += e.len;
                    // Consume the following elements up to the closing tag 3. The
                    // length is added as each element is consumed, so a run that
                    // reaches the end of the list without a tag 3 is not counted twice.
                    while (i < last) {
                        e = elements.get(++i);
                        end += e.len;
                        if (e.getTag() == 3) {
                            break;
                        }
                    }
                    result.add(line.substring(begin, end));
                    begin = end;
                    break;
                default:
                    // NOTE(review): a tag 2 or 3 that does not follow a tag 1 is skipped
                    // without advancing the offsets, as before.
                    break;
            }
        }
        return result;
    }

    /**
     * Tags the elements of a line: scores every position, propagates the best
     * predecessor with {@code Element.maxFrom}, then walks back from the last
     * element to assign the tags.
     *
     * @param line the text to tag
     * @return the tagged elements
     */
    private List<Element> vterbi(String line) {
        List<Element> elements = Config.wordAlert(line);
        int length = elements.size();
        if (length == 0) { // nothing to tag; also avoids get(0) on an empty list
            return elements;
        }
        if (length == 1) {
            elements.get(0).updateTag(0);
            return elements;
        }
        // Score every tag at every position.
        for (int i = 0; i < length; i++) {
            computeTagScore(elements, i);
        }
        // The first element cannot be M or E, so those tags get a very small score.
        elements.get(0).tagScore[2] = -1000;
        elements.get(0).tagScore[3] = -1000;
        for (int i = 1; i < length; i++) {
            elements.get(i).maxFrom(model, elements.get(i - 1));
        }
        // The last element can only be S or E (tag 0 or 3).
        Element tail = elements.get(elements.size() - 1);
        int maxStatus = tail.tagScore[0] > tail.tagScore[3] ? 0 : 3;
        tail.updateTag(maxStatus);
        maxStatus = tail.from[maxStatus];
        // Walk back through the recorded predecessors.
        for (int i = elements.size() - 2; i > 0; i--) {
            Element self = elements.get(i);
            self.updateTag(maxStatus);
            maxStatus = self.from[self.getTag()];
        }
        elements.get(0).updateTag(maxStatus);
        return elements;
    }

    /**
     * Computes the tag scores of one position by accumulating, through
     * {@code MatrixUtil.dot}, the weights of every feature built for it.
     *
     * @param elements the elements of the line
     * @param index    the position to score
     */
    private void computeTagScore(List<Element> elements, int index) {
        char[][] features = model.getConfig().makeFeatureArr(elements, index);
        // TODO: 20 looks rather large (Config.TAG_NUM * Config.TAG_NUM + Config.TAG_NUM).
        float[] tagScore = new float[20];
        for (int i = 0; i < features.length; i++) {
            MatrixUtil.dot(tagScore, model.getFeature(features[i]));
        }
        elements.get(index).tagScore = tagScore;
    }

    /**
     * Scores how well the given text holds together as a single word: the B score
     * of the first element, plus the M score of every inner element, plus the E
     * score of the last element.
     *
     * @param word the candidate word
     * @return {@code Integer.MIN_VALUE} for an empty word, {@code 1} when the sum
     *         is negative, otherwise the sum plus one
     */
    public float cohesion(String word) {
        if (word.length() == 0) {
            return Integer.MIN_VALUE;
        }
        List<Element> elements = Config.wordAlert(word);
        for (int i = 0; i < elements.size(); i++) {
            computeTagScore(elements, i);
        }
        float value = elements.get(0).tagScore[1];
        int len = elements.size() - 1;
        for (int i = 1; i < len; i++) {
            value += elements.get(i).tagScore[2];
        }
        value += elements.get(len).tagScore[3];
        if (value < 0) {
            return 1;
        } else {
            value += 1;
        }
        return value;
    }
}

View File

@ -1,92 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.crf.model;
import org.ansj.app.crf.Config;
import org.ansj.app.crf.Model;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.IOUtil;
import java.io.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipException;
/**
 * Model stored in the binary, gzip-compressed format: a version string, the
 * transition matrix, the template, then groups of feature weights.
 */
public class CRFModel extends Model {

    /** Version string stored at the start of a model file. */
    public static final String VERSION = "ansj1";

    /**
     * Loads the model from a file.
     *
     * @param modelPath path of the model file
     * @return this model
     * @throws Exception if the file cannot be read or decoded
     */
    @Override
    public CRFModel loadModel(String modelPath) throws Exception {
        try (InputStream is = IOUtil.getInputStream(modelPath)) {
            loadModel(is);
            return this;
        }
    }

    /**
     * Loads the model from a stream. The stream is closed when loading ends.
     *
     * <p>Feature weights are stored as groups of {@code (width, count, entries)};
     * groups are read until the {@code (0, 0)} terminator pair is reached.
     *
     * @param is stream that holds the gzip-compressed model
     * @return this model
     * @throws Exception if the stream cannot be read or decoded
     */
    @Override
    public CRFModel loadModel(InputStream is) throws Exception {
        long start = System.currentTimeMillis();
        try (ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(is))) {
            ois.readUTF(); // version string; not validated here
            this.status = (float[][]) ois.readObject();
            int[][] template = (int[][]) ois.readObject();
            this.config = new Config(template);
            featureTree = new SmartForest<float[]>();
            while (true) {
                int win = ois.readInt();
                int size = ois.readInt();
                // A (0, 0) pair marks the end of the groups. The previous loop
                // condition stopped after the first non-empty group instead.
                if (win == 0 && size == 0) {
                    break;
                }
                for (int i = 0; i < size; i++) {
                    String name = ois.readUTF();
                    float[] value = new float[win];
                    for (int j = 0; j < value.length; j++) {
                        value[j] = ois.readFloat();
                    }
                    featureTree.add(name, value);
                }
            }
            logger.info("load crf model ok ! use time :" + (System.currentTimeMillis() - start));
        }
        return this;
    }

    /**
     * Checks whether the file is a gzip-compressed object stream that starts with
     * {@link #VERSION}. Read failures are logged and reported as {@code false}.
     *
     * @param modelPath path of the model file
     * @return {@code true} when the file carries the expected version string
     */
    @Override
    public boolean checkModel(String modelPath) {
        // All three streams are resources so the inflater is released promptly.
        try (FileInputStream fis = new FileInputStream(modelPath);
                GZIPInputStream gzip = new GZIPInputStream(fis);
                ObjectInputStream inputStream = new ObjectInputStream(gzip)) {
            String version = inputStream.readUTF();
            if (VERSION.equals(version)) {
                return true;
            }
        } catch (ZipException ze) {
            logger.warn("解压异常", ze);
        } catch (FileNotFoundException e) {
            logger.warn("文件没有找到", e);
        } catch (IOException e) {
            logger.warn("IO异常", e);
        }
        return false;
    }
}

View File

@ -1,332 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.crf.model;
import org.ansj.app.crf.Config;
import org.ansj.app.crf.Model;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.ObjConver;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.tuples.Pair;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
/**
 * Model loaded from a CRF++ text-format model file.
 */
public class CRFppTxtModel extends Model {

    /**
     * Loads the model from a CRF++ text file.
     *
     * @param modelPath path of the model file
     * @return this model
     * @throws Exception if the file cannot be read or parsed
     */
    @Override
    public CRFppTxtModel loadModel(String modelPath) throws Exception {
        try (InputStream is = new FileInputStream(modelPath)) {
            // Read from the managed stream; opening a second one here leaked it.
            loadModel(is);
            return this;
        }
    }

    /**
     * Loads the model from a stream: skips the four header lines, then reads the
     * tag list, the template, the feature names and finally the weights.
     *
     * @param is stream that holds the text model
     * @return this model
     * @throws Exception if the stream cannot be read or parsed
     */
    @Override
    public Model loadModel(InputStream is) throws Exception {
        long start = System.currentTimeMillis();
        BufferedReader reader = IOUtil.getReader(is, IOUtil.UTF8);
        reader.readLine(); // version
        reader.readLine(); // cost-factor
        reader.readLine(); // maxid (not used)
        reader.readLine(); // xsize
        int[] statusCoven = loadTagCoven(reader);
        Map<String, Integer> featureIndex = loadConfig(reader);
        StringBuilder sb = new StringBuilder();
        for (int[] t1 : config.getTemplate()) {
            sb.append(Arrays.toString(t1) + " ");
        }
        logger.info("load template ok template : " + sb);
        TreeMap<Integer, Pair<String, String>> featureNames = loadFeatureName(featureIndex, reader);
        logger.info("load feature ok feature size : " + featureNames.size());
        loadFeatureWeight(reader, statusCoven, featureNames);
        logger.info("load crfpp model ok ! use time : " + (System.currentTimeMillis() - start));
        return this;
    }

    /**
     * Reads the feature-name section, one feature per line in the form
     * {@code "<id> <template>:<name>"} (for example {@code 11 *6:_x-1/的}), until a
     * blank line or the end of the input.
     *
     * @param featureIndex template name to template row index
     * @param br           reader positioned at the feature-name section
     * @return feature id mapped to (template name, encoded feature string); the
     *         encoded string is empty for lines without a {@code ':'}
     * @throws Exception if a line cannot be parsed
     */
    private TreeMap<Integer, Pair<String, String>> loadFeatureName(Map<String, Integer> featureIndex, BufferedReader br)
            throws Exception {
        TreeMap<Integer, Pair<String, String>> featureNames = new TreeMap<>();
        String temp = null;
        while (StringUtil.isNotBlank(temp = br.readLine())) {
            int indexOf = temp.indexOf(" ");
            if (indexOf < 0) {
                // Without a separator there is no id to read; fail with the line
                // instead of a bare StringIndexOutOfBoundsException.
                throw new Exception("err feature line without id separator : " + temp);
            }
            int id = ObjConver.getIntValue(temp.substring(0, indexOf));
            if (indexOf > 0) {
                temp = temp.substring(indexOf);
            }
            String[] split = temp.split(":");
            if (split.length == 1) {
                featureNames.put(id, Pair.with(temp.trim(), ""));
            } else {
                // The name itself may contain ':'; join the remaining parts again.
                StringBuilder joined = new StringBuilder(split[1]);
                for (int j = 2; j < split.length; j++) {
                    joined.append(":").append(split[j]);
                }
                String name = joined.toString();
                int lastFeatureId = featureIndex.get(split[0].trim());
                // A literal '/' collides with the part separator; encode it as a
                // placeholder that toFeatureName turns back into '/'.
                if ("/".equals(name)) {
                    name = "//";
                }
                if (name.contains("//")) {
                    name = name.replaceAll("//", "/XIEGANG/");
                }
                String featureName = toFeatureName(name.trim().split("/"), lastFeatureId);
                featureNames.put(id, Pair.with(split[0].trim(), featureName));
            }
        }
        return featureNames;
    }

    /**
     * Encodes the parts of a feature name into the internal feature string: one
     * character per part, followed by the template marker character.
     *
     * @param split         the parts of the feature name
     * @param lastFeatureId template row index of the feature
     * @return the encoded feature string
     * @throws Exception if a part is not recognised
     */
    private String toFeatureName(String[] split, int lastFeatureId) throws Exception {
        StringBuilder result = new StringBuilder();
        for (String str : split) {
            if ("".equals(str)) {
                continue;
            } else if (str.length() == 1) {
                result.append(str.charAt(0));
            } else if (str.equals("XIEGANG")) {
                result.append('/');
            } else if (str.startsWith("num")) {
                result.append((char) (Config.NUM_BEGIN + ObjConver.getIntValue(str.replace("num", ""))));
            } else if (str.startsWith("en")) {
                result.append((char) (Config.EN_BEGIN + ObjConver.getIntValue(str.replace("en", ""))));
            } else if (str.startsWith("_B-")) {
                result.append(Config.BEGIN);
            } else if (str.startsWith("_B+")) {
                result.append(Config.END);
            } else {
                throw new Exception("can not find feature named " + str + " in " + Arrays.toString(split));
            }
        }
        result.append((char) (lastFeatureId + Config.FEATURE_BEGIN));
        return result.toString();
    }

    /**
     * Reads the weight section. Template names starting with {@code B} fill the
     * transition matrix; all other features are added to the feature tree.
     *
     * @param br           reader positioned at the weight section
     * @param statusCoven  conversion table from file order to internal tag index
     * @param featureNames features in id order
     * @throws Exception if a feature type is unknown or reading fails
     */
    private void loadFeatureWeight(BufferedReader br, int[] statusCoven,
            TreeMap<Integer, Pair<String, String>> featureNames) throws Exception {
        featureTree = new SmartForest<float[]>();
        for (Pair<String, String> pair : featureNames.values()) {
            char fc = Character.toUpperCase(pair.getValue0().charAt(0));
            // Size of the weight array for this feature type.
            int len = fc == 'B' ? Config.TAG_NUM * Config.TAG_NUM
                    : fc == 'U' ? Config.TAG_NUM
                            : fc == '*' ? (Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM) : 0;
            if (len == 0) {
                throw new Exception("unknow feature type " + pair.getValue0());
            }
            if (fc == 'B') { // transition weights go into the status matrix
                for (int i = 0; i < len; i++) {
                    String temp = br.readLine();
                    int from = statusCoven[i / Config.TAG_NUM];
                    int to = statusCoven[i % Config.TAG_NUM];
                    status[from][to] = ObjConver.getFloatValue(temp);
                }
            } else {
                String name = pair.getValue1();
                float[] tempW = new float[len];
                for (int i = 0; i < len; i++) {
                    String temp = br.readLine();
                    int tag = statusCoven[i]; // internal slot for the i-th weight in the file
                    tempW[tag] = ObjConver.getFloatValue(temp);
                }
                this.featureTree.add(name, tempW); // add the feature to the feature tree
            }
        }
    }

    /**
     * Reads the tag list and builds the conversion table from file order to the
     * internal tag indices. The first {@code TAG_NUM} entries map single tags, the
     * remaining entries map tag pairs.
     *
     * @param br reader positioned at the tag list
     * @return the conversion table
     * @throws Exception if the input ends early or contains an unknown tag
     */
    private int[] loadTagCoven(BufferedReader br) throws Exception {
        int[] conver = new int[Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM];
        // TODO: the tags are hard-coded; this has to be rewritten if the tag set changes.
        for (int i = 0; i < Config.TAG_NUM; i++) {
            String line = br.readLine();
            if (line == null) {
                // End of input: retrying the same slot would never terminate.
                throw new Exception("unexpected end of model while reading tags, read " + i + " of " + Config.TAG_NUM);
            }
            if (StringUtil.isBlank(line)) {
                i--; // blank lines do not consume a tag slot
                continue;
            }
            char c = line.charAt(0);
            switch (c) {
                case 'S':
                    conver[i] = Config.S;
                    break;
                case 'B':
                    conver[i] = Config.B;
                    break;
                case 'M':
                    conver[i] = Config.M;
                    break;
                case 'E':
                    conver[i] = Config.E;
                    break;
                default:
                    throw new Exception("err tag named " + c + " in model " + line);
            }
        }
        for (int i = Config.TAG_NUM; i < conver.length; i++) {
            conver[i] = conver[(i - Config.TAG_NUM) / Config.TAG_NUM] * Config.TAG_NUM + conver[i % Config.TAG_NUM]
                    + Config.TAG_NUM;
        }
        return conver;
    }

    /**
     * Reads the template section, stores it as this model's {@link Config} and
     * returns the position of every template.
     *
     * @param br reader positioned at the template section
     * @return template name (the text before the first {@code ':'}) mapped to its
     *         row index in the template
     * @throws IOException if reading fails
     */
    private Map<String, Integer> loadConfig(BufferedReader br) throws IOException {
        Map<String, Integer> featureIndex = new HashMap<>();
        String temp = br.readLine(); // the first line of the section is skipped
        List<int[]> list = new ArrayList<>();
        while (StringUtil.isNotBlank((temp = br.readLine()))) {
            List<String> matcherAll = StringUtil.matcherAll("\\[.*?\\]", temp);
            if (matcherAll.isEmpty()) {
                continue;
            }
            int[] is = new int[matcherAll.size()];
            for (int j = 0; j < is.length; j++) {
                is[j] = ObjConver.getIntValue(StringUtil.matcherFirst("[-\\d]+", matcherAll.get(j)));
            }
            featureIndex.put(temp.split(":")[0].trim(), list.size());
            list.add(is);
        }
        int[][] template = new int[list.size()][0]; // build the feature template
        for (int i = 0; i < template.length; i++) {
            template[i] = list.get(i);
        }
        config = new Config(template);
        return featureIndex;
    }

    /**
     * Checks whether the file starts with the text {@code version}. Read failures
     * are logged and reported as {@code false}.
     *
     * @param modelPath path of the model file
     * @return {@code true} when the file looks like a CRF++ text model
     */
    @Override
    public boolean checkModel(String modelPath) {
        try (InputStream is = IOUtil.getInputStream(modelPath)) {
            byte[] bytes = new byte[100];
            int read = is.read(bytes);
            if (read <= 0) {
                return false;
            }
            // Decode only the bytes that were actually read.
            String head = new String(bytes, 0, read, java.nio.charset.StandardCharsets.UTF_8);
            if (head.startsWith("version")) {
                return true;
            }
        } catch (IOException e) {
            logger.warn("IO异常", e);
        }
        return false;
    }
}

View File

@ -1,360 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.crf.model;
import org.ansj.app.crf.Config;
import org.ansj.app.crf.Model;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.ObjConver;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.tuples.Pair;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
/**
 * Loads a CRF model stored as text whose first line starts with {@code #mdl#}
 * (the Wapiti text format, going by the class name and log messages).
 *
 * <p>The sections are read strictly in this order from a single reader:
 * header line, feature templates ({@link #loadConfig}), tag list
 * ({@link #loadTagCoven}), feature names ({@link #loadFeatureName}) and finally
 * the weights ({@link #loadFeatureWeight}). Results are stored in the inherited
 * {@code config}, {@code status} and {@code featureTree} fields.
 */
public class WapitiCRFModel extends Model {

    /**
     * Opens {@code modelPath} and loads the model from it; the stream is closed
     * before returning.
     *
     * @param modelPath location understood by {@code IOUtil.getInputStream}
     * @return this instance, populated
     * @throws Exception on I/O failure or malformed model content
     */
    @Override
    public WapitiCRFModel loadModel(String modelPath) throws Exception {
        try (InputStream is = IOUtil.getInputStream(modelPath)) {
            return loadModel(is);
        }
    }

    /**
     * Loads the model from an already opened stream (read as UTF-8). The stream is
     * not closed here; that is left to the caller.
     *
     * @param is stream positioned at the start of the model text
     * @return this instance, populated
     * @throws Exception on I/O failure or malformed model content
     */
    @Override
    public WapitiCRFModel loadModel(InputStream is) throws Exception {
        BufferedReader br = IOUtil.getReader(is, IOUtil.UTF8);
        long start = System.currentTimeMillis();
        logger.info("load wapiti model begin!");

        String temp = br.readLine();
        logger.info(temp); // header line, e.g. #mdl#2#123

        Map<String, Integer> featureIndex = loadConfig(br);
        StringBuilder sb = new StringBuilder();
        for (int[] t1 : config.getTemplate()) {
            sb.append(Arrays.toString(t1)).append(" ");
        }
        logger.info("featureIndex is " + featureIndex);
        logger.info("load template ok template : " + sb);

        int[] statusCoven = loadTagCoven(br);
        List<Pair<String, String>> loadFeatureName = loadFeatureName(featureIndex, br);
        logger.info("load feature ok feature size : " + loadFeatureName.size());

        featureTree = new SmartForest<float[]>();
        loadFeatureWeight(br, statusCoven, loadFeatureName);
        logger.info("load wapiti model ok ! use time :" + (System.currentTimeMillis() - start));
        return this;
    }

    /**
     * Reads the {@code id=weight} lines and attaches them to the features, in the
     * order given by {@code featureNames}.
     *
     * <p>Each feature owns a contiguous range of weight ids whose size depends on
     * the first character of its type: {@code B} owns TAG_NUM*TAG_NUM ids,
     * {@code U} owns TAG_NUM ids and {@code *} owns both. {@code B} weights go to
     * the {@code status} transition matrix; all others are added to
     * {@code featureTree} under the feature's name.
     *
     * @param br           reader positioned at the first weight line
     * @param statusCoven  tag-index conversion table built by {@link #loadTagCoven}
     * @param featureNames (type, encoded name) pairs from {@link #loadFeatureName}
     * @throws Exception on an unknown feature type, a truncated transition block
     *                   or I/O failure
     */
    private void loadFeatureWeight(BufferedReader br, int[] statusCoven, List<Pair<String, String>> featureNames)
            throws Exception {
        int max = 0; // exclusive upper bound of the weight ids owned by the current feature
        String temp = br.readLine();

        for (Pair<String, String> pair : featureNames) {
            if (temp == null) {
                // Weight lines ran out: the remaining features simply get no weights.
                logger.warn(pair.getValue0() + "\t" + pair.getValue1() + " not have any weight ,so skip it !");
                continue;
            }

            char fc = Character.toUpperCase(pair.getValue0().charAt(0));
            int len = fc == 'B' ? Config.TAG_NUM * Config.TAG_NUM
                    : fc == 'U' ? Config.TAG_NUM
                            : fc == '*' ? (Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM) : 0;
            if (len == 0) {
                throw new Exception("unknow feature type " + pair.getValue0());
            }

            int min = max; // inclusive lower bound of this feature's id range
            max += len;

            if (fc == 'B') {
                // Transition weights: consumes exactly len consecutive lines and uses
                // only their position, not the id before '='.
                for (int i = 0; i < len; i++) {
                    if (temp == null) {
                        // Previously this surfaced as a bare NullPointerException.
                        throw new Exception("model ended inside the transition weights of feature "
                                + pair.getValue0());
                    }
                    String[] split = temp.split("=");
                    int from = statusCoven[i / Config.TAG_NUM];
                    int to = statusCoven[i % Config.TAG_NUM];
                    status[from][to] = ObjConver.getFloatValue(split[1]);
                    temp = br.readLine();
                }
            } else {
                String name = pair.getValue1();
                float[] tempW = new float[len];
                do {
                    String[] split = temp.split("=");
                    int key = ObjConver.getIntValue(split[0]);
                    if (key >= max) {
                        // This id belongs to a later feature; leave the line for it.
                        break;
                    }
                    // Convert the id's offset inside the range to the internal tag slot.
                    tempW[statusCoven[key - min]] = ObjConver.getFloatValue(split[1]);
                } while ((temp = br.readLine()) != null);
                this.featureTree.add(name, tempW); // add the feature to the feature tree
            }
        }
    }

    /**
     * Reads the feature-name section. Example line: {@code 11:*6:_x-1/的,}
     *
     * <p>The first line of the section carries the number of entries. For each
     * entry the second ':'-separated field is the feature type; everything after
     * it (with its last character dropped) is the name, which is translated by
     * {@link #toFeatureName}. Entries with only two fields get an empty name.
     *
     * @param featureIndex feature type -> template row, from {@link #loadConfig}
     * @param br           reader positioned at the section's count line
     * @return (feature type, encoded feature name) pairs in file order
     * @throws Exception on I/O failure or a name token that cannot be translated
     */
    private List<Pair<String, String>> loadFeatureName(Map<String, Integer> featureIndex, BufferedReader br)
            throws Exception {
        String temp = br.readLine(); // #qrk#num
        int featureNum = ObjConver.getIntValue(StringUtil.matcherFirst("\\d+", temp)); // number of features
        List<Pair<String, String>> featureNames = new ArrayList<>();

        for (int i = 0; i < featureNum; i++) {
            temp = br.readLine();
            String[] split = temp.split(":");
            if (split.length == 2) {
                featureNames.add(Pair.with(split[1], ""));
                continue;
            }

            // The name itself may contain ':'; stitch the remaining fields back together.
            String name = split[2];
            for (int j = 3; j < split.length; j++) {
                name += ":" + split[j];
            }
            // Drop the last character of the line (the original comment calls it a
            // trailing space; in the example above it is the ',' terminator).
            name = name.substring(0, name.length() - 1);

            int lastFeatureId = featureIndex.get(split[1]);

            // '/' is both the token separator and a possible token: mark literal
            // slashes with a placeholder before splitting on '/'.
            if ("/".equals(name)) {
                name = "//";
            }
            if (name.contains("//")) {
                name = name.replaceAll("//", "/XIEGANG/");
            }

            String featureName = toFeatureName(name.trim().split("/"), lastFeatureId);
            featureNames.add(Pair.with(split[1], featureName));
        }
        return featureNames;
    }

    /**
     * Encodes the tokens of a feature name as one char each and appends a final
     * char identifying the template row.
     *
     * @param split         name tokens; empty tokens are ignored
     * @param lastFeatureId template row index, encoded as
     *                      {@code lastFeatureId + Config.FEATURE_BEGIN}
     * @return the encoded feature name
     * @throws Exception if a token matches none of the known forms
     */
    private String toFeatureName(String[] split, int lastFeatureId) throws Exception {
        StringBuilder result = new StringBuilder();
        for (String str : split) {
            if ("".equals(str)) {
                continue;
            } else if (str.length() == 1) {
                result.append(str.charAt(0));
            } else if (str.equals("XIEGANG")) {
                // Placeholder inserted by loadFeatureName for a literal '/'.
                result.append('/');
            } else if (str.startsWith("num")) {
                result.append((char) (Config.NUM_BEGIN + ObjConver.getIntValue(str.replace("num", ""))));
            } else if (str.startsWith("en")) {
                result.append((char) (Config.EN_BEGIN + ObjConver.getIntValue(str.replace("en", ""))));
            } else if (str.startsWith("_x-")) {
                result.append(Config.BEGIN);
            } else if (str.startsWith("_x+")) {
                result.append(Config.END);
            } else {
                // Message fixed: it used to read "can find", the opposite of what happened.
                throw new Exception("can not find feature named " + str + " in " + Arrays.toString(split));
            }
        }
        result.append((char) (lastFeatureId + Config.FEATURE_BEGIN));
        return result.toString();
    }

    /**
     * Reads the tag section and builds the table that converts the model's tag
     * positions to the internal ones.
     *
     * <p>Slots {@code [0, TAG_NUM)} map the i-th tag listed in the model to
     * {@code Config.S/B/M/E}. Slots from {@code TAG_NUM} on map a (from, to) tag
     * pair, laid out row-major in model order, to the internal pair slot
     * {@code from * TAG_NUM + to + TAG_NUM}.
     *
     * @param br reader positioned at the section's header line
     * @return conversion table of size TAG_NUM + TAG_NUM * TAG_NUM
     * @throws Exception on an unrecognised tag letter or I/O failure
     */
    private int[] loadTagCoven(BufferedReader br) throws Exception {
        int[] conver = new int[Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM];
        String temp = br.readLine(); // #qrk#4

        // TODO: the tag set is hard-wired here; if the tags change this must be rewritten.
        for (int i = 0; i < Config.TAG_NUM; i++) {
            char c = br.readLine().split(":")[1].charAt(0);
            switch (c) {
                case 'S':
                    conver[i] = Config.S;
                    break;
                case 'B':
                    conver[i] = Config.B;
                    break;
                case 'M':
                    conver[i] = Config.M;
                    break;
                case 'E':
                    conver[i] = Config.E;
                    break;
                default:
                    throw new Exception("err tag named " + c + " in model " + temp);
            }
        }

        for (int i = Config.TAG_NUM; i < conver.length; i++) {
            // The pair slots start at TAG_NUM, so subtract TAG_NUM (this was a literal 4)
            // to recover the "from" tag position.
            conver[i] = conver[(i - Config.TAG_NUM) / Config.TAG_NUM] * Config.TAG_NUM
                    + conver[i % Config.TAG_NUM] + Config.TAG_NUM;
        }
        return conver;
    }

    /**
     * Reads the feature-template section and stores the template in
     * {@code config}.
     *
     * <p>The first line carries the number of template lines. Lines without any
     * {@code [...]} group are skipped; for the others, one integer is taken from
     * each group and the second ':'-separated field is mapped to the row index.
     *
     * @param br reader positioned at the section's count line
     * @return feature type -> row index in the template
     * @throws IOException on read failure
     */
    private Map<String, Integer> loadConfig(BufferedReader br) throws IOException {
        Map<String, Integer> featureIndex = new HashMap<>();
        String temp = br.readLine(); // #rdr#8/0/0
        int featureNum = ObjConver.getIntValue(StringUtil.matcherFirst("\\d+", temp)); // number of template lines
        List<int[]> list = new ArrayList<>();

        for (int i = 0; i < featureNum; i++) {
            temp = br.readLine();
            List<String> matcherAll = StringUtil.matcherAll("\\[.*?\\]", temp);
            if (matcherAll.isEmpty()) {
                continue;
            }
            int[] is = new int[matcherAll.size()];
            for (int j = 0; j < is.length; j++) {
                is[j] = ObjConver.getIntValue(StringUtil.matcherFirst("[-\\d]+", matcherAll.get(j)));
            }
            featureIndex.put(temp.split(":")[1], list.size());
            list.add(is);
        }

        // Build the feature template.
        int[][] template = new int[list.size()][0];
        for (int i = 0; i < template.length; i++) {
            template[i] = list.get(i);
        }
        config = new Config(template);
        return featureIndex;
    }

    /**
     * Tells whether the file looks like a model this class can load, by checking
     * that its first bytes spell {@code #mdl#}.
     *
     * @param modelPath location understood by {@code IOUtil.getInputStream}
     * @return {@code true} if the header matches; {@code false} otherwise or when
     *         the file cannot be read (the I/O error is logged)
     */
    @Override
    public boolean checkModel(String modelPath) {
        try (InputStream is = IOUtil.getInputStream(modelPath)) {
            byte[] bytes = new byte[100];
            is.read(bytes);
            String string = new String(bytes);
            if (string.startsWith("#mdl#")) { // text model header handled by this loader
                return true;
            }
        } catch (IOException e) {
            logger.warn("IO异常", e);
        }
        return false;
    }
}

View File

@ -1,110 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.crf.pojo;
import org.ansj.app.crf.Config;
import org.ansj.app.crf.Model;
/**
 * One character of the input together with the per-tag scores and
 * back-pointers used while decoding a tag sequence.
 */
public class Element {

    /** The character this element stands for. */
    public char name;

    /** Assigned tag; -1 until one is set. */
    private int tag = -1;

    public int len = 1;

    public String nature;

    /** Score per tag; may carry extra pair scores after the first TAG_NUM slots. */
    public float[] tagScore;

    /** For each tag, the previous element's tag chosen by {@link #maxFrom}. */
    public int[] from;

    public Element(char name) {
        this.name = name;
    }

    public Element(Character name, int tag) {
        this.name = name;
        this.tag = tag;
    }

    public int getTag() {
        return tag;
    }

    /** Sets the tag and returns this element for chaining. */
    public Element updateTag(int tag) {
        this.tag = tag;
        return this;
    }

    /** Sets the nature and returns this element for chaining. */
    public Element updateNature(String nature) {
        this.nature = nature;
        return this;
    }

    @Override
    public String toString() {
        return new StringBuilder()
                .append(name).append("/")
                .append(len).append("/")
                .append(tag)
                .toString();
    }

    public char getName() {
        return name;
    }

    /**
     * Renders the character as text: code points 130..139 become
     * {@code "num0".."num9"}, 140..149 become {@code "en0".."en9"}, anything
     * else is the character itself.
     *
     * @return the printable form of {@link #name}
     */
    public String nameStr() {
        // NOTE(review): 130 and 140 presumably mirror Config.NUM_BEGIN and
        // Config.EN_BEGIN - confirm against Config.
        boolean isNumberClass = name >= 130 && name < 140;
        if (isNumberClass) {
            return ("num" + (name - 130));
        }
        boolean isLetterClass = name >= 140 && name < 150;
        if (isLetterClass) {
            return ("en" + (name - 140));
        }
        return String.valueOf(name);
    }

    /**
     * For every tag of this element, picks the tag of {@code element} (the
     * preceding one) that gives the highest combined score, records it in
     * {@link #from} and replaces this element's score for that tag with the
     * best combined value.
     *
     * @param model   supplies the tag-to-tag rate via {@code tagRate(prev, cur)}
     * @param element the preceding element, whose {@code tagScore} is read
     */
    public void maxFrom(Model model, Element element) {
        if (from == null) {
            from = new int[Config.TAG_NUM];
        }
        float[] previousScores = element.tagScore;
        for (int cur = 0; cur < Config.TAG_NUM; cur++) {
            // NOTE(review): the running best starts at 0, so when every candidate
            // is <= 0 the back-pointer is left untouched and the score becomes 0.
            float best = 0;
            for (int prev = 0; prev < Config.TAG_NUM; prev++) {
                float candidate = (previousScores[prev] + tagScore[cur]) + model.tagRate(prev, cur);
                if (tagScore.length > Config.TAG_NUM) {
                    // Extra per-pair score stored after the first TAG_NUM slots.
                    candidate += tagScore[Config.TAG_NUM + prev * Config.TAG_NUM + cur];
                }
                if (candidate > best) {
                    best = candidate;
                    from[cur] = prev;
                }
            }
            tagScore[cur] = best;
        }
    }
}

View File

@ -1,163 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.keyword;
import org.ansj.domain.Term;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.nlpcn.commons.lang.util.StringUtil;
import java.util.*;
/**
 * Extracts the highest-scoring keywords of an article.
 *
 * <p>The text is segmented with the configured analyzer; every term gets a
 * weight from its part of speech and its position (terms inside the title
 * count more), and the weights are combined per word through {@link Keyword}.
 *
 * @param <T> analyzer type used to segment the text
 */
public class KeyWordComputer<T extends Analysis> {

    /** Part-of-speech multipliers; a value of 0 excludes the term, unlisted tags count as 1. */
    private static final Map<String, Double> POS_SCORE = new HashMap<>();

    private T analysisType;

    static {
        POS_SCORE.put("null", 0.0);
        POS_SCORE.put("w", 0.0);
        POS_SCORE.put("en", 0.0);
        POS_SCORE.put("m", 0.0);
        POS_SCORE.put("num", 0.0);
        POS_SCORE.put("nr", 3.0);
        POS_SCORE.put("nrf", 3.0);
        POS_SCORE.put("nw", 3.0);
        POS_SCORE.put("nt", 3.0);
        POS_SCORE.put("l", 0.2);
        POS_SCORE.put("a", 0.2);
        POS_SCORE.put("nz", 3.0);
        POS_SCORE.put("v", 0.2);
        POS_SCORE.put("kw", 6.0); // keyword part of speech
    }

    /** Maximum number of keywords returned. */
    private int nKeyword = 5;

    /**
     * Creates a computer returning at most 5 keywords. If no analyzer is set
     * through {@link #setAnalysisType}, {@link NlpAnalysis} is used on first use.
     */
    public KeyWordComputer() {}

    public void setAnalysisType(T analysisType) {
        this.analysisType = analysisType;
    }

    /**
     * Creates a computer that uses {@link NlpAnalysis} for segmentation.
     *
     * @param nKeyword maximum number of keywords to return
     */
    @SuppressWarnings("unchecked") // same default the class has always used for any T
    public KeyWordComputer(int nKeyword) {
        this.nKeyword = nKeyword;
        this.analysisType = (T) new NlpAnalysis(); // NLP segmentation is the default
    }

    /**
     * @param nKeyword     maximum number of keywords to return
     * @param analysisType analyzer used to segment the text
     */
    public KeyWordComputer(int nKeyword, T analysisType) {
        this.nKeyword = nKeyword;
        this.analysisType = analysisType;
    }

    /**
     * Returns the analyzer, falling back to {@link NlpAnalysis} when none was
     * configured, so the no-arg constructor is usable without an extra setter call
     * (it used to fail with a NullPointerException).
     */
    @SuppressWarnings("unchecked") // mirrors the default chosen by KeyWordComputer(int)
    private T analysis() {
        if (analysisType == null) {
            analysisType = (T) new NlpAnalysis();
        }
        return analysisType;
    }

    /**
     * Scores every term of {@code content} and returns the best ones.
     *
     * @param content     full text (title first when there is one)
     * @param titleLength number of leading characters that belong to the title
     * @return at most {@code nKeyword} keywords, highest score first
     */
    private List<Keyword> computeArticleTfidf(String content, int titleLength) {
        Map<String, Keyword> tm = new HashMap<>();
        List<Term> parse = analysis().parseStr(content).getTerms();
        // FIXME: this depends on the part of speech from the user-defined dictionary,
        // so a separate method is needed..
        // parse = FilterModifWord.updateNature(parse) ;
        for (Term term : parse) {
            double weight = getWeight(term, content.length(), titleLength);
            if (weight == 0) {
                continue;
            }
            Keyword keyword = tm.get(term.getName());
            if (keyword == null) {
                // First occurrence: the positional weight seeds the score.
                keyword = new Keyword(term.getName(), term.natrue().allFrequency, weight);
                tm.put(term.getName(), keyword);
            } else {
                // Later occurrences each add a fixed weight of 1.
                keyword.updateWeight(1);
            }
        }

        // Keyword's natural order puts the highest score first.
        TreeSet<Keyword> treeSet = new TreeSet<>(tm.values());
        ArrayList<Keyword> arrayList = new ArrayList<>(treeSet);
        if (treeSet.size() <= nKeyword) {
            return arrayList;
        } else {
            return arrayList.subList(0, nKeyword);
        }
    }

    /**
     * Extracts keywords from an article that has a title; terms located in the
     * title are weighted more heavily.
     *
     * @param title   article title; blank or {@code null} is treated as empty
     * @param content article body; blank or {@code null} is treated as empty
     * @return at most {@code nKeyword} keywords, highest score first
     */
    public List<Keyword> computeArticleTfidf(String title, String content) {
        if (StringUtil.isBlank(title)) {
            title = "";
        }
        if (StringUtil.isBlank(content)) {
            content = "";
        }
        return computeArticleTfidf(title + "\t" + content, title.length());
    }

    /**
     * Extracts keywords from a text without a title.
     *
     * @param content text to analyse
     * @return at most {@code nKeyword} keywords, highest score first
     */
    public List<Keyword> computeArticleTfidf(String content) {
        return computeArticleTfidf(content, 0);
    }

    /**
     * Weight of a single term occurrence.
     *
     * @param term        the term
     * @param length      length of the whole text
     * @param titleLength number of leading characters that belong to the title
     * @return 0 when the term is ignored (shorter than two characters after
     *         trimming, or an excluded part of speech); otherwise the
     *         part-of-speech multiplier times 5 for title terms, or times a factor
     *         that falls linearly from 1 towards 0 with the term's offset
     */
    private double getWeight(Term term, int length, int titleLength) {
        if (term.getName().trim().length() < 2) {
            return 0;
        }
        String pos = term.natrue().natureStr;
        Double posScore = POS_SCORE.get(pos);
        if (posScore == null) {
            posScore = 1.0;
        } else if (posScore == 0) {
            return 0;
        }
        if (titleLength > term.getOffe()) {
            return 5 * posScore;
        }
        return (length - term.getOffe()) * posScore / length;
    }
}

View File

@ -1,94 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.keyword;
/**
 * A word with its accumulated keyword score.
 *
 * <p>Natural ordering is by score, highest first, with the name as tie-breaker,
 * so distinct words with the same score can live together in a sorted set.
 * Equality is by name only. The hash code follows the name, so do not call
 * {@link #setName} on an instance that is stored in a hash-based collection.
 */
public class Keyword implements Comparable<Keyword> {

    private String name;

    private double score;

    /** Weight added per unit of {@link #updateWeight}. */
    private double idf;

    /** Number of occurrences counted so far. */
    private int freq;

    /**
     * Creates a keyword seen once, with an idf derived from its document
     * frequency: {@code ln(1 + 10000 / (docFreq + 1))}.
     *
     * @param name    the word
     * @param docFreq document frequency of the word
     * @param weight  weight of this first occurrence; the initial score is
     *                {@code idf * weight}
     */
    public Keyword(String name, int docFreq, double weight) {
        this.name = name;
        this.idf = Math.log(1 + 10000.0 / (docFreq + 1));
        this.score = idf * weight;
        freq++;
    }

    /**
     * Creates a keyword seen once whose score and idf are both {@code score}.
     *
     * @param name  the word
     * @param score initial score, also used as the idf
     */
    public Keyword(String name, double score) {
        this.name = name;
        this.score = score;
        this.idf = score;
        freq++;
    }

    /**
     * Records one more occurrence, adding {@code weight * idf} to the score.
     *
     * @param weight weight of the new occurrence
     */
    public void updateWeight(int weight) {
        this.score += weight * idf;
        freq++;
    }

    public int getFreq() {
        return freq;
    }

    /**
     * Orders by score descending, then by name ascending ({@code null} names
     * first). Unlike the previous implementation, which never returned 0 and
     * answered -1 in both directions for equal scores, this is a consistent total
     * order, as sorted collections and sort routines require.
     */
    @Override
    public int compareTo(Keyword o) {
        int byScore = Double.compare(o.score, this.score);
        if (byScore != 0) {
            return byScore;
        }
        if (name == null) {
            return o.name == null ? 0 : -1;
        }
        return o.name == null ? 1 : name.compareTo(o.name);
    }

    /** Two keywords are equal when their names are equal; the score is ignored. */
    @Override
    public boolean equals(Object obj) {
        if (obj instanceof Keyword) {
            Keyword k = (Keyword) obj;
            return name == null ? k.name == null : name.equals(k.name);
        } else {
            return false;
        }
    }

    /** Consistent with {@link #equals(Object)}: derived from the name only. */
    @Override
    public int hashCode() {
        return name == null ? 0 : name.hashCode();
    }

    @Override
    public String toString() {
        return name + "/" + score;// "="+score+":"+freq+":"+idf;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public double getScore() {
        return score;
    }

    public void setScore(double score) {
        this.score = score;
    }
}

View File

@ -1,25 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
/**
* @author
*
*/
package org.ansj.app.keyword;

View File

@ -1,332 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.summary;
import org.ansj.app.keyword.KeyWordComputer;
import org.ansj.app.keyword.Keyword;
import org.ansj.app.summary.pojo.Summary;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.nlpcn.commons.lang.tire.SmartGetWord;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.MapCount;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* ,
*
* @author ansj
*
*/
public class SummaryComputer {
private static final Set<String> FILTER_SET = new HashSet<>();
static {
FILTER_SET.add("w");
FILTER_SET.add("null");
}
/**
* summaryLength
*/
private int len = 300;
private boolean isSplitSummary = true;
String title, content;
public SummaryComputer(String title, String content) {
this.title = title;
this.content = content;
}
public SummaryComputer(int len, String title, String content) {
this.len = len;
this.title = title;
this.content = content;
}
public SummaryComputer(int len, boolean isSplitSummary, String title, String content) {
this.len = len;
this.title = title;
this.content = content;
this.isSplitSummary = isSplitSummary;
}
/**
*
*
* @return
*/
public Summary toSummary() {
return toSummary(new ArrayList<Keyword>());
}
/**
*
*
* @return
*/
public Summary toSummary(String query) {
List<Term> parse = NlpAnalysis.parse(query).getTerms();
List<Keyword> keywords = new ArrayList<>();
for (Term term : parse) {
if (FILTER_SET.contains(term.natrue().natureStr)) {
continue;
}
keywords.add(new Keyword(term.getName(), term.termNatures().allFreq, 1));
}
return toSummary(keywords);
}
/**
*
*
* @return
*/
public Summary toSummary(List<Keyword> keywords) {
if (keywords == null) {
keywords = new ArrayList<>();
}
if (keywords.isEmpty()) {
KeyWordComputer kc = new KeyWordComputer(10);
keywords = kc.computeArticleTfidf(title, content);
}
return explan(keywords, content);
}
/**
*
*
* @param keyword
* @param content
* @return
*/
private Summary explan(List<Keyword> keywords, String content) {
SmartForest<Double> sf = new SmartForest<>();
for (Keyword keyword : keywords) {
sf.add(keyword.getName(), keyword.getScore());
}
// 先断句
List<Sentence> sentences = toSentenceList(content.toCharArray());
for (Sentence sentence : sentences) {
computeScore(sentence, sf);
}
double maxScore = 0;
int maxIndex = 0;
MapCount<String> mc = new MapCount<>();
for (int i = 0; i < sentences.size(); i++) {
double tempScore = sentences.get(i).score;
int tempLength = sentences.get(i).value.length();
mc.addAll(sentences.get(i).mc.get());
if (tempLength >= len) {
tempScore = tempScore * mc.get().size();
if (maxScore < tempScore) {
maxScore = tempScore;
maxIndex = i;
continue;
}
mc.get().clear();
}
for (int j = i + 1; j < sentences.size(); j++) {
tempScore += sentences.get(j).score;
tempLength += sentences.get(j).value.length();
mc.addAll(sentences.get(j).mc.get());
if (tempLength >= len) {
tempScore = tempScore * mc.get().size();
if (maxScore < tempScore) {
maxScore = tempScore;
maxIndex = i;
}
mc.get().clear();
break;
}
}
if (tempLength < len) {
tempScore = tempScore * mc.get().size();
if (maxScore < tempScore) {
maxScore = tempScore;
maxIndex = i;
break;
}
mc.get().clear();
}
}
StringBuilder sb = new StringBuilder();
for (int i = maxIndex; i < sentences.size(); i++) {
sb.append(sentences.get(i).value);
if (sb.length() > len) {
break;
}
}
String summaryStr = sb.toString();
/**
* abc
*/
if (isSplitSummary && sb.length() > len) {
double value = len;
StringBuilder newSummary = new StringBuilder();
char c = 0;
for (int i = 0; i < sb.length(); i++) {
c = sb.charAt(i);
if (c < 256) {
value -= 0.5;
} else {
value -= 1;
}
if (value < 0) {
break;
}
newSummary.append(c);
}
summaryStr = newSummary.toString();
}
return new Summary(keywords, summaryStr);
}
/**
*
*
* @param sentence
* @param sf
*/
private void computeScore(Sentence sentence, SmartForest<Double> forest) {
SmartGetWord<Double> sgw = new SmartGetWord<>(forest, sentence.value);
String name = null;
while ((name = sgw.getFrontWords()) != null) {
sentence.updateScore(name, sgw.getParam());
}
if (sentence.score == 0) {
sentence.score = sentence.value.length() * -0.005;
} else {
sentence.score /= Math.log(sentence.value.length() + 3);
}
}
public List<Sentence> toSentenceList(char[] chars) {
StringBuilder sb = new StringBuilder();
List<Sentence> sentences = new ArrayList<>();
for (int i = 0; i < chars.length; i++) {
if (sb.length() == 0 && (Character.isWhitespace(chars[i]) || chars[i] == ' ')) {
continue;
}
sb.append(chars[i]);
switch (chars[i]) {
case '.':
if (i < chars.length - 1 && chars[i + 1] > 128) {
insertIntoList(sb, sentences);
sb = new StringBuilder();
}
break;
//case ' ':
case ' ':
case ' ':
case ' ':
case ',':
case '。':
case ';':
case '':
case '!':
case '':
case '':
case '?':
case '':
case '\n':
case '\r':
insertIntoList(sb, sentences);
sb = new StringBuilder();
}
}
if (sb.length() > 0) {
insertIntoList(sb, sentences);
}
return sentences;
}
private void insertIntoList(StringBuilder sb, List<Sentence> sentences) {
String content = sb.toString().trim();
if (content.length() > 0) {
sentences.add(new Sentence(content));
}
}
/*
*
*/
public class Sentence {
String value;
private double score;
private MapCount<String> mc = new MapCount<>();
public Sentence(String value) {
this.value = value.trim();
}
public void updateScore(String name, double score) {
mc.add(name);
Double size = mc.get().get(name);
this.score += score / size;
}
@Override
public String toString() {
return value;
}
}
}

View File

@ -1,75 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.summary;
import org.ansj.app.keyword.Keyword;
import org.ansj.app.summary.pojo.Summary;
import org.nlpcn.commons.lang.tire.SmartGetWord;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import java.util.List;
/**
 * Wraps every keyword occurrence in a text with a pair of tags, for example to
 * highlight the keywords of a summary.
 *
 * @author ansj
 *
 */
public class TagContent {

    private String beginTag, endTag;

    /**
     * @param beginTag text inserted before each keyword occurrence
     * @param endTag   text inserted after each keyword occurrence
     */
    public TagContent(String beginTag, String endTag) {
        this.beginTag = beginTag;
        this.endTag = endTag;
    }

    /**
     * Tags the keywords of a summary inside the summary text.
     *
     * @param summary summary carrying both the text and its keywords
     * @return the summary text with every keyword occurrence wrapped
     */
    public String tagContent(Summary summary) {
        return tagContent(summary.getKeyWords(), summary.getSummary());
    }

    /**
     * Tags the given keywords inside {@code content}. Matching is done on the
     * lower-cased keywords and text; the output keeps the original casing.
     *
     * @param keyWords keywords to look for
     * @param content  text to tag
     * @return {@code content} with every keyword occurrence wrapped
     */
    public String tagContent(List<Keyword> keyWords, String content) {
        SmartForest<Double> dictionary = new SmartForest<>();
        for (Keyword keyWord : keyWords) {
            dictionary.add(keyWord.getName().toLowerCase(), keyWord.getScore());
        }

        SmartGetWord<Double> matcher = new SmartGetWord<>(dictionary, content.toLowerCase());
        StringBuilder tagged = new StringBuilder();
        int copiedUpTo = 0;

        for (String hit = matcher.getFrontWords(); hit != null; hit = matcher.getFrontWords()) {
            int hitStart = matcher.offe;
            int hitEnd = hitStart + hit.length();
            // Untouched text before the hit, then the hit itself between the tags.
            tagged.append(content.substring(copiedUpTo, hitStart))
                    .append(beginTag)
                    .append(content.substring(hitStart, hitEnd))
                    .append(endTag);
            copiedUpTo = hitEnd;
        }

        // Whatever follows the last hit.
        if (copiedUpTo < content.length()) {
            tagged.append(content.substring(copiedUpTo));
        }
        return tagged.toString();
    }
}

View File

@ -1,58 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.app.summary.pojo;
import org.ansj.app.keyword.Keyword;
import java.util.List;
/**
 * Result of a summarisation: the summary text plus the keywords it was built
 * from.
 *
 * @author ansj
 *
 */
public class Summary {

    /** Keywords used to build the summary. */
    private final List<Keyword> keyWords;

    /** The summary text. */
    private final String summary;

    /**
     * @param keyWords keywords used to build the summary
     * @param summary  the summary text
     */
    public Summary(List<Keyword> keyWords, String summary) {
        this.summary = summary;
        this.keyWords = keyWords;
    }

    /** @return the keywords, exactly the list given to the constructor */
    public List<Keyword> getKeyWords() {
        return this.keyWords;
    }

    /** @return the summary text */
    public String getSummary() {
        return this.summary;
    }
}

View File

@ -1,56 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.dic;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
/**
 * Opens dictionary files that are bundled on the classpath.
 *
 * @author ansj
 */
public class DicReader {

    private static final Log logger = LogFactory.getLog();

    /**
     * Opens a classpath resource as a UTF-8 reader.
     *
     * @param name resource path relative to the classpath root (no leading '/')
     * @return a buffered reader over the resource, or {@code null} when the
     *         resource does not exist (a warning is logged; this used to fail with
     *         an uninformative NullPointerException) or the encoding is unsupported
     */
    public static BufferedReader getReader(String name) {
        // Dictionaries are loaded from the classpath (maven project layout).
        InputStream in = getInputStream(name);
        if (in == null) {
            logger.warn("dictionary resource not found on classpath: /" + name);
            return null;
        }
        try {
            return new BufferedReader(new InputStreamReader(in, "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            logger.warn("不支持的编码", e);
        }
        return null;
    }

    /**
     * Opens a classpath resource as a raw stream.
     *
     * @param name resource path relative to the classpath root (no leading '/')
     * @return the stream, or {@code null} when the resource does not exist
     */
    public static InputStream getInputStream(String name) {
        // Dictionaries are loaded from the classpath (maven project layout).
        InputStream in = DicReader.class.getResourceAsStream("/" + name);
        return in;
    }
}

View File

@ -1,207 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.dic;
import org.ansj.app.crf.SplitWord;
import org.ansj.domain.Nature;
import org.ansj.domain.NewWord;
import org.ansj.domain.TermNatures;
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.recognition.impl.NatureRecognition;
import org.ansj.util.Graph;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.CollectionUtil;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
/**
* ,线.
*
* @author ansj
*
*/
public class LearnTool {
private SplitWord splitWord = null;
/**
*
*/
public boolean isAsianName = true;
public boolean isForeignName = true;
/**
*
*/
public int count;
/**
* ...
*/
private final SmartForest<NewWord> sf = new SmartForest<>();
/**
*
*/
private Forest[] forests;
/**
* .
*
* @param graph
*/
public void learn(Graph graph, SplitWord splitWord, Forest... forests) {
this.splitWord = splitWord;
this.forests = forests;
// 亚洲人名识别
if (isAsianName) {
findAsianPerson(graph);
}
// 外国人名识别
if (isForeignName) {
findForeignPerson(graph);
}
}
private void findAsianPerson(Graph graph) {
List<NewWord> newWords = new AsianPersonRecognition().getNewWords(graph.terms);
addListToTerm(newWords);
}
private void findForeignPerson(Graph graph) {
List<NewWord> newWords = new ForeignPersonRecognition().getNewWords(graph.terms);
addListToTerm(newWords);
}
// 批量将新词加入到词典中
private void addListToTerm(List<NewWord> newWords) {
if (newWords.isEmpty())
return;
for (NewWord newWord : newWords) {
TermNatures termNatures = new NatureRecognition(forests).getTermNatures(newWord.getName());
if (termNatures == TermNatures.NULL) {
addTerm(newWord);
}
}
}
/**
*
*
* @param newWord
*/
public void addTerm(NewWord newWord) {
NewWord temp = null;
SmartForest<NewWord> smartForest = null;
if ((smartForest = sf.getBranch(newWord.getName())) != null && smartForest.getParam() != null) {
temp = smartForest.getParam();
temp.update(newWord.getNature(), newWord.getAllFreq());
} else {
count++;
if (splitWord == null) {
newWord.setScore(-1);
} else {
newWord.setScore(-splitWord.cohesion(newWord.getName()));
}
synchronized (sf) {
sf.add(newWord.getName(), newWord);
}
}
}
public SmartForest<NewWord> getForest() {
return this.sf;
}
/**
* .
*
* @param num .0
* @return
*/
public List<Entry<String, Double>> getTopTree(int num) {
return getTopTree(num, null);
}
public List<Entry<String, Double>> getTopTree(int num, Nature nature) {
if (sf.branches == null) {
return null;
}
HashMap<String, Double> hm = new HashMap<>();
for (int i = 0; i < sf.branches.length; i++) {
valueResult(sf.branches[i], hm, nature);
}
List<Entry<String, Double>> sortMapByValue = CollectionUtil.sortMapByValue(hm, -1);
if (num == 0) {
return sortMapByValue;
} else {
num = Math.min(num, sortMapByValue.size());
return sortMapByValue.subList(0, num);
}
}
/**
 * Recursively walks the children of {@code smartForest} and copies matching words into {@code hm}.
 *
 * <p>A child with status 2 or 3 contributes its parameter when that parameter is active and its
 * nature matches. Every child except those with status 3 is then descended into. (Status 2/3
 * presumably mark "word that can continue" and "terminal word"; confirm against SmartForest.)
 *
 * @param smartForest node whose children are inspected; {@code null} or childless nodes are ignored
 * @param hm          output map from word name to score
 * @param nature      nature to keep, or {@code null} to keep all natures
 */
private void valueResult(SmartForest<NewWord> smartForest, HashMap<String, Double> hm, Nature nature) {
    if (smartForest == null || smartForest.branches == null) {
        return;
    }
    for (SmartForest<NewWord> child : smartForest.branches) {
        NewWord word = child.getParam();
        boolean terminal = child.getStatus() == 3;
        boolean carriesWord = terminal || child.getStatus() == 2;

        if (carriesWord && word.isActive() && (nature == null || word.getNature().equals(nature))) {
            hm.put(word.getName(), word.getScore());
        }
        if (!terminal) {
            valueResult(child, hm, nature);
        }
    }
}
/**
 * Marks the learned word stored under {@code name} as active.
 *
 * <p>Does nothing when the forest has no branch for the name or the branch holds no parameter.
 *
 * @param name name of the word to activate
 */
public void active(String name) {
    SmartForest<NewWord> branch = sf.getBranch(name);
    if (branch == null) {
        return;
    }
    NewWord word = branch.getParam();
    if (word != null) {
        word.setActive(true);
    }
}
}

View File

@ -1,66 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.dic;
import org.ansj.dic.impl.File2Stream;
import org.ansj.dic.impl.Jar2Stream;
import org.ansj.dic.impl.Jdbc2Stream;
import org.ansj.dic.impl.Url2Stream;
import org.ansj.exception.LibraryException;
import org.deeplearning4j.common.config.DL4JClassLoading;
import java.io.InputStream;
/**
 * Turns a path string into an {@link InputStream}, choosing the loader from the path's scheme prefix.
 */
public abstract class PathToStream {

    /**
     * Opens the resource described by {@code path}.
     *
     * <p>Recognised prefixes are {@code file://}, {@code jdbc://}, {@code jar://}, {@code class://},
     * {@code http://} and {@code https://}; any other path is handled as a plain file path.
     *
     * @param path path with an optional scheme prefix
     * @return the stream produced by the selected loader
     * @throws LibraryException wrapping any exception raised while resolving or opening the path
     */
    public static InputStream stream(String path) {
        try {
            if (path.startsWith("jdbc://")) {
                return new Jdbc2Stream().toStream(path);
            }
            if (path.startsWith("jar://")) {
                return new Jar2Stream().toStream(path);
            }
            if (path.startsWith("class://")) {
                // Probably unused
                return loadClass(path);
            }
            if (path.startsWith("http://") || path.startsWith("https://")) {
                return new Url2Stream().toStream(path);
            }
            // Both an explicit file:// prefix and an unprefixed path go to the file loader.
            return new File2Stream().toStream(path);
        } catch (Exception e) {
            throw new LibraryException(e);
        }
    }

    /**
     * Opens the resource described by {@code path} using this loader's strategy.
     *
     * @param path the full path, including any scheme prefix
     * @return a stream over the resource
     */
    public abstract InputStream toStream(String path);

    /**
     * Handles {@code class://<className>[|...]} paths: instantiates the named class as a
     * {@code PathToStream} and delegates to it with the complete, unmodified path.
     */
    static InputStream loadClass(String path) {
        String withoutScheme = path.substring("class://".length());
        String className = withoutScheme.split("\\|")[0];
        PathToStream delegate = DL4JClassLoading.createNewInstance(className, PathToStream.class);
        return delegate.toStream(path);
    }
}

View File

@ -1,103 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.dic.impl;
import org.ansj.dic.PathToStream;
import org.ansj.exception.LibraryException;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.*;
import java.util.Vector;
/**
 * Loads dictionary data from the local file system.
 *
 * <p>A regular file is opened directly. For a directory, every readable, non-hidden regular file
 * directly inside it is opened and the streams are concatenated.
 */
public class File2Stream extends PathToStream {
    private static final Log LOG = LogFactory.getLog(File2Stream.class);

    /**
     * Opens the file or directory named by {@code path}.
     *
     * @param path file-system path, optionally prefixed with {@code file://}
     * @return a stream over the file, or over all files of the directory
     * @throws LibraryException if the path does not exist, is not readable, or opening fails
     */
    @Override
    public InputStream toStream(String path) {
        LOG.info("path to stream " + path);
        if (path.startsWith("file://")) {
            path = path.substring(7);
        }
        File file = new File(path);
        if (file.exists() && file.canRead()) {
            try {
                if (file.isDirectory()) {
                    return multiple(path);
                } else {
                    return new FileInputStream(file);
                }
            } catch (Exception e) {
                throw new LibraryException(e);
            }
        }
        throw new LibraryException(
                " path :" + path + " file:" + file.getAbsolutePath() + " not found or can not to read");
    }

    /**
     * Opens every eligible file under {@code path} as one concatenated stream.
     *
     * <p>NOTE(review): the concatenation order is whatever {@link File#listFiles(FileFilter)}
     * returns, which the JDK does not guarantee to be sorted.
     *
     * @throws FileNotFoundException if one of the files cannot be opened; streams that were
     *                               already opened are closed before the exception propagates
     * @throws LibraryException      if no file is found
     */
    private InputStream multiple(String path) throws FileNotFoundException {
        File[] libs = new File[0];
        File file = new File(path);
        if (file.exists() && file.canRead()) {
            if (file.isFile()) {
                libs = new File[1];
                libs[0] = file;
            } else if (file.isDirectory()) {
                File[] files = file.listFiles(new FileFilter() {
                    @Override
                    public boolean accept(File candidate) {
                        return candidate.canRead() && !candidate.isHidden() && !candidate.isDirectory();
                    }
                });
                if (files != null && files.length > 0) {
                    libs = files;
                }
            }
        }
        if (libs.length == 0) {
            throw new LibraryException("not find any file in path : " + path);
        }
        if (libs.length == 1) {
            return new FileInputStream(libs[0]);
        }

        // SequenceInputStream needs an Enumeration, which Vector provides directly.
        Vector<InputStream> vector = new Vector<>(libs.length);
        boolean allOpened = false;
        try {
            for (File lib : libs) {
                vector.add(new FileInputStream(lib));
            }
            allOpened = true;
        } finally {
            if (!allOpened) {
                // A later open failed: release the handles acquired so far instead of leaking them.
                closeQuietly(vector);
            }
        }
        return new SequenceInputStream(vector.elements());
    }

    /** Closes each stream, ignoring close failures so the original error is the one reported. */
    private static void closeQuietly(Iterable<InputStream> streams) {
        for (InputStream stream : streams) {
            try {
                stream.close();
            } catch (IOException ignored) {
                // Best effort cleanup on an already-failing path.
            }
        }
    }
}

View File

@ -1,50 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.dic.impl;
import org.ansj.dic.DicReader;
import org.ansj.dic.PathToStream;
import org.ansj.exception.LibraryException;
import org.deeplearning4j.common.config.DL4JClassLoading;
import java.io.InputStream;
/**
 * Loads a resource addressed by a {@code jar://} path.
 *
 * <p>Two forms are handled: {@code jar://<className>|<resourceName>}, resolved relative to the
 * named class, and {@code jar://<resource>}, resolved through {@code DicReader}.
 */
public class Jar2Stream extends PathToStream {

    /**
     * Opens the resource named by {@code path}.
     *
     * @param path path starting with the six-character {@code jar://} prefix
     * @return the resource stream as returned by the underlying lookup
     * @throws LibraryException if the class named in the path cannot be loaded
     */
    @Override
    public InputStream toStream(String path) {
        if (!path.contains("|")) {
            return DicReader.getInputStream(path.substring(6));
        }

        String[] tokens = path.split("\\|");
        // Drop the leading "jar://" from the class part.
        String className = tokens[0].substring(6);
        String resourceName = tokens[1].trim();

        Class<Object> resourceClass = DL4JClassLoading.loadClassByName(className);
        if (resourceClass == null) {
            throw new LibraryException(String.format("Class '%s' was not found.", className));
        }
        return resourceClass.getResourceAsStream(resourceName);
    }
}

View File

@ -1,115 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.dic.impl;
import org.ansj.dic.PathToStream;
import org.ansj.exception.LibraryException;
import org.deeplearning4j.common.config.DL4JClassLoading;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
/**
 * Loads dictionary data by running a SQL query over JDBC.
 *
 * <p>The path format is {@code jdbc://<jdbcUrl>|<username>|<password>|<sql>}. Each result row
 * becomes one line of the returned stream, with columns separated by tabs.
 */
public class Jdbc2Stream extends PathToStream {
    // NOTE(review): getBytes() without a charset uses the platform default encoding.
    private static final byte[] TAB = "\t".getBytes();
    private static final byte[] LINE = "\n".getBytes();

    /** Number of '|'-separated fields; the last one (the SQL) may itself contain '|'. */
    private static final int FIELD_COUNT = 4;

    private static final String[] JDBC_DRIVERS = {
            "org.h2.Driver",
            "com.ibm.db2.jcc.DB2Driver",
            "org.hsqldb.jdbcDriver",
            "org.gjt.mm.mysql.Driver",
            "oracle.jdbc.OracleDriver",
            "org.postgresql.Driver",
            "net.sourceforge.jtds.jdbc.Driver",
            "com.microsoft.sqlserver.jdbc.SQLServerDriver",
            "org.sqlite.JDBC",
            "com.mysql.jdbc.Driver"
    };

    static {
        loadJdbcDrivers();
    }

    /** Asks {@code DL4JClassLoading} to load each known driver class by name. */
    static void loadJdbcDrivers() {
        for (String driverClassName : JDBC_DRIVERS) {
            DL4JClassLoading.loadClassByName(driverClassName);
        }
    }

    /**
     * Runs the query described by {@code path} and returns its rows as tab-separated lines.
     *
     * @param path {@code jdbc://<jdbcUrl>|<username>|<password>|<sql>}
     * @return an in-memory stream holding the complete result
     * @throws LibraryException if connecting, querying or reading fails; the message carries the
     *                          path with the password masked, plus the underlying failure message
     */
    @Override
    public InputStream toStream(String path) {
        // Strip the leading "jdbc://".
        path = path.substring(7);

        // Limit the split so that '|' inside the SQL (e.g. the "||" operator) stays in the statement.
        String[] split = path.split("\\|", FIELD_COUNT);

        String jdbc = split[0];
        String username = split[1];
        String password = split[2];
        String sqlStr = split[3];

        String logStr = jdbc + "|" + username + "|********|" + sqlStr;

        try (Connection conn = DriverManager.getConnection(jdbc, username, password);
             PreparedStatement statement = conn.prepareStatement(sqlStr);
             ResultSet rs = statement.executeQuery();
             ByteArrayOutputStream baos = new ByteArrayOutputStream(100 * 1024)) {

            // The column count does not change between rows, so read it once.
            int count = rs.getMetaData().getColumnCount();
            while (rs.next()) {
                for (int i = 1; i < count; ++i) {
                    baos.write(String.valueOf(rs.getObject(i)).getBytes());
                    baos.write(TAB);
                }
                // Last column is followed by a newline instead of a tab.
                baos.write(String.valueOf(rs.getObject(count)).getBytes());
                baos.write(LINE);
            }

            return new ByteArrayInputStream(baos.toByteArray());
        } catch (Exception e) {
            // Keep the failure reason, as Url2Stream does, so the error can be diagnosed.
            throw new LibraryException("err to load by jdbc " + logStr + " message : " + e.getMessage());
        }
    }

    /**
     * Returns {@code path} with its password field replaced by asterisks, for safe logging.
     *
     * @param path {@code <jdbcUrl>|<username>|<password>|<sql>}
     * @return the same four fields with the third one masked
     */
    public static String encryption(String path) {
        String[] split = path.split("\\|", FIELD_COUNT);

        String jdbc = split[0];
        String username = split[1];
        String sqlStr = split[3];

        return jdbc + "|" + username + "|********|" + sqlStr;
    }
}

View File

@ -1,42 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.dic.impl;
import org.ansj.dic.PathToStream;
import org.ansj.exception.LibraryException;
import java.io.InputStream;
import java.net.URL;
/** Loads dictionary data from a URL. */
public class Url2Stream extends PathToStream {

    /**
     * Opens a stream to the given URL.
     *
     * @param path URL text passed to {@link URL#URL(String)}
     * @return the stream returned by {@link URL#openStream()}
     * @throws LibraryException if the URL is malformed or cannot be opened
     */
    @Override
    public InputStream toStream(String path) {
        try {
            return new URL(path).openStream();
        } catch (Exception e) {
            throw new LibraryException("err to load by http " + path + " message : " + e.getMessage());
        }
    }
}

View File

@ -1,82 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.domain;
import org.nlpcn.commons.lang.dat.Item;
import java.io.Serializable;
import java.util.Map;
/**
 * Dictionary item that, in addition to the fields inherited from {@code Item}, carries a raw
 * parameter string, the term natures parsed from it and an optional bigram map.
 */
public class AnsjItem extends Item implements Serializable {

    private static final long serialVersionUID = 1L;

    public static final AnsjItem NULL = new AnsjItem();

    public static final AnsjItem BEGIN = new AnsjItem();

    public static final AnsjItem END = new AnsjItem();

    static {
        NULL.base = 0;

        BEGIN.index = 0;
        BEGIN.termNatures = TermNatures.BEGIN;

        END.index = -1;
        END.termNatures = TermNatures.END;
    }

    /** Raw parameter text: second column in {@link #init(String[])}, last column of {@link #toText()}. */
    public String param;

    /** Natures of this item; assigned by {@link #initValue(String[])} or the static initializer. */
    public TermNatures termNatures = null;

    public Map<Integer, Integer> bigramEntryMap = null;

    /**
     * Initialises name and parameter from a two-column record.
     *
     * @param split {@code [name, param]}
     */
    @Override
    public void init(String[] split) {
        this.param = split[1];
        this.name = split[0];
    }

    /**
     * Initialises all fields from a record laid out as
     * {@code [index, name, base, check, status, natures]}.
     *
     * <p>Name and natures are only read when {@code status > 1}; otherwise the item receives a
     * {@code TermNatures} built from {@code TermNature.NULL}.
     *
     * @param split the record columns
     */
    @Override
    public void initValue(String[] split) {
        index = Integer.parseInt(split[0]);
        base = Integer.parseInt(split[2]);
        check = Integer.parseInt(split[3]);
        status = Byte.parseByte(split[4]);

        if (status <= 1) {
            termNatures = new TermNatures(TermNature.NULL);
            return;
        }
        name = split[1];
        termNatures = new TermNatures(TermNature.setNatureStrToArray(split[5]), index);
    }

    /**
     * Serialises the item as tab-separated {@code index, name, base, check, status, param}.
     */
    @Override
    public String toText() {
        StringBuilder text = new StringBuilder();
        text.append(index).append("\t");
        text.append(name).append("\t");
        text.append(base).append("\t");
        text.append(check).append("\t");
        text.append(status).append("\t");
        text.append(param);
        return text.toString();
    }
}

View File

@ -1,53 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.domain;
/**
 * Mutable pair of a key and a value.
 *
 * @param <K> key type
 * @param <V> value type
 */
public class KV<K, V> {

    private K key;

    private V value;

    private KV(K k, V v) {
        key = k;
        value = v;
    }

    /**
     * Creates a pair holding the given key and value.
     *
     * @param k key, may be {@code null}
     * @param v value, may be {@code null}
     * @return a new pair
     */
    public static <K, V> KV<K, V> with(K k, V v) {
        KV<K, V> pair = new KV<>(k, v);
        return pair;
    }

    /** Replaces the key. */
    public void setK(K k) {
        key = k;
    }

    /** Replaces the value. */
    public void setV(V v) {
        value = v;
    }

    /** Returns the current key. */
    public K getK() {
        return key;
    }

    /** Returns the current value. */
    public V getV() {
        return value;
    }
}

View File

@ -1,73 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.domain;
import org.ansj.library.NatureLibrary;
import java.io.Serializable;
/**
 * Immutable description of a word nature (part-of-speech tag).
 *
 * @author ansj
 */
public class Nature implements Serializable {

    private static final long serialVersionUID = -1427092012930357598L;

    /** Name of the nature. */
    public final String natureStr;

    /** Position in the nature lookup table. */
    public final int index;

    /** Index value of the nature. */
    public final int natureIndex;

    /** Frequency of the nature. */
    public final int allFrequency;

    public static final Nature NW = NatureLibrary.getNature("nw");

    public static final Nature NRF = NatureLibrary.getNature("nrf");

    public static final Nature NR = NatureLibrary.getNature("nr");

    public static final Nature NULL = NatureLibrary.getNature("null");

    /**
     * Creates a nature with every attribute given explicitly.
     *
     * @param natureStr    name of the nature
     * @param index        position in the nature lookup table
     * @param natureIndex  index value of the nature
     * @param allFrequency frequency of the nature
     */
    public Nature(String natureStr, int index, int natureIndex, int allFrequency) {
        this.natureStr = natureStr;
        this.index = index;
        this.natureIndex = natureIndex;
        this.allFrequency = allFrequency;
    }

    /**
     * Creates a nature that has only a name; all numeric attributes are zero.
     *
     * @param natureStr name of the nature
     */
    public Nature(String natureStr) {
        this(natureStr, 0, 0, 0);
    }

    /** Formats the nature as {@code natureStr:index:natureIndex}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(natureStr).append(":").append(index).append(":").append(natureIndex);
        return text.toString();
    }
}

View File

@ -1,116 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.domain;
import java.io.Serializable;
/**
 * A newly discovered word together with its score, nature, total frequency and active flag.
 *
 * @author ansj
 */
public class NewWord implements Serializable {

    private static final long serialVersionUID = 7226797287286838356L;

    /** The word itself. */
    private String name;

    /** Score of the word. */
    private double score;

    /** Nature (part of speech) of the word. */
    private Nature nature;

    /** Total frequency of the word. */
    private int allFreq;

    /** Whether this word has been activated. */
    private boolean isActive;

    /**
     * Creates a word with an explicit score and a total frequency of one.
     *
     * @param name   the word
     * @param nature nature of the word
     * @param score  initial score
     */
    public NewWord(String name, Nature nature, double score) {
        this.name = name;
        this.nature = nature;
        this.score = score;
        this.allFreq = 1;
    }

    /**
     * Creates a word with a score of zero and a total frequency of one.
     *
     * @param name   the word
     * @param nature nature of the word
     */
    public NewWord(String name, Nature nature) {
        this(name, nature, 0);
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public double getScore() {
        return score;
    }

    public void setScore(double score) {
        this.score = score;
    }

    public Nature getNature() {
        return nature;
    }

    public void setNature(Nature nature) {
        this.nature = nature;
    }

    public int getAllFreq() {
        return allFreq;
    }

    public boolean isActive() {
        return isActive;
    }

    public void setActive(boolean isActive) {
        this.isActive = isActive;
    }

    /**
     * Merges another sighting of this word into the entry.
     *
     * <p>The score grows by its current value multiplied by {@code freq}, the total frequency
     * grows by {@code freq}, and the nature is replaced unless the incoming nature is the
     * {@code Nature.NW} instance.
     *
     * @param nature nature reported by the new sighting
     * @param freq   frequency reported by the new sighting
     */
    public void update(Nature nature, int freq) {
        this.score += this.score * freq;
        this.allFreq += freq;
        if (nature != Nature.NW) {
            this.nature = nature;
        }
    }

    /** Formats the word as tab-separated name, score and nature name. */
    @Override
    public String toString() {
        return name + "\t" + score + "\t" + getNature().natureStr;
    }
}

View File

@ -1,44 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.domain;
import java.io.Serializable;
/** Number-related attributes of a dictionary entry. */
public class NumNatureAttr implements Serializable {

    private static final long serialVersionUID = 1L;

    public static final NumNatureAttr NULL = new NumNatureAttr();

    /** Frequency with which the entry may be a number; starts at -1. */
    public int numFreq;

    /** Frequency with which the entry ends a number; starts at -1. */
    public int numEndFreq;

    /** Whether the entry's most frequent nature is a numeral; starts as false. */
    public boolean flag;

    /** Creates an instance with both frequencies at -1 and the flag cleared. */
    public NumNatureAttr() {
        this.numFreq = -1;
        this.numEndFreq = -1;
        this.flag = false;
    }
}

View File

@ -1,129 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.domain;
import java.io.Serializable;
/**
 * Person-name statistics attached to a dictionary entry.
 *
 * <p>Legend of the position codes left by the original author:
 * B (0) surname; C (1) first character of a two-character given name; D (2) last character of a
 * two-character given name; E (3) single-character given name; N (4) any character;
 * L (11) context following a name; M (12) component between two Chinese names;
 * m (44) splittable name. Patterns noted by the author: "BC", "BCD", "BCDE", "BCDEN".
 *
 * @author ansj
 */
public class PersonNatureAttr implements Serializable {

    private static final long serialVersionUID = -8443825231800208197L;

    public static final PersonNatureAttr NULL = new PersonNatureAttr();

    private int[][] locFreq = null;

    /** Accumulated frequency for code 44. */
    public int split;

    /** Accumulated frequency for code 12. */
    public int begin;

    /** Accumulated frequency for codes 11 and 12. */
    public int end;

    /** Sum of every frequency accepted by {@link #addFreq(int, int)}. */
    public int allFreq;

    /** Whether the entry could be the first character of a name. */
    public boolean flag;

    /**
     * Adds a frequency for one of the context codes 11, 12 or 44; other codes are ignored.
     *
     * @param index context code
     * @param freq  frequency to add
     */
    public void addFreq(int index, int freq) {
        if (index == 11) {
            end += freq;
        } else if (index == 12) {
            end += freq;
            begin += freq;
        } else if (index == 44) {
            split += freq;
        } else {
            return;
        }
        allFreq += freq;
    }

    /**
     * Looks up a frequency in the location table.
     *
     * @param length first table index, capped at 3
     * @param loc    second table index, capped at 4
     * @return the stored frequency, or 0 when no table has been set
     */
    public int getFreq(int length, int loc) {
        if (locFreq == null) {
            return 0;
        }
        return locFreq[Math.min(length, 3)][Math.min(loc, 4)];
    }

    /**
     * Installs the location table and raises {@link #flag} when any row has a positive first column.
     *
     * @param ints the table; stored by reference
     */
    public void setlocFreq(int[][] ints) {
        for (int[] row : ints) {
            if (row[0] > 0) {
                flag = true;
                break;
            }
        }
        locFreq = ints;
    }

    /** Formats the counters as {@code begin=..,end=..,split=..}. */
    @Override
    public String toString() {
        return "begin=" + begin + "," + "end=" + end + "," + "split=" + split;
    }
}

View File

@ -1,114 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.domain;
import org.ansj.recognition.Recognition;
import org.nlpcn.commons.lang.util.StringUtil;
import java.util.Iterator;
import java.util.List;
/**
 * Holds the list of terms produced by an analysis and offers iteration and formatting helpers.
 *
 * @author Ansj
 */
public class Result implements Iterable<Term> {

    private List<Term> terms = null;

    /**
     * @param terms the terms to wrap; stored by reference
     */
    public Result(List<Term> terms) {
        this.terms = terms;
    }

    public List<Term> getTerms() {
        return this.terms;
    }

    public void setTerms(List<Term> terms) {
        this.terms = terms;
    }

    @Override
    public Iterator<Term> iterator() {
        return this.terms.iterator();
    }

    /** Returns the number of terms. */
    public int size() {
        return this.terms.size();
    }

    /** Returns the term at {@code index}. */
    public Term get(int index) {
        return this.terms.get(index);
    }

    /**
     * Lets {@code re} process this result in place.
     *
     * @param re the recognition step to run
     * @return this result, to allow chaining
     */
    public Result recognition(Recognition re) {
        re.recognition(this);
        return this;
    }

    /** Joins the terms with a comma. */
    @Override
    public String toString() {
        return toString(",");
    }

    /**
     * Joins the terms using {@code StringUtil.joiner}.
     *
     * @param split separator placed between terms
     */
    public String toString(String split) {
        return StringUtil.joiner(this.terms, split);
    }

    /**
     * Joins the real names of the terms with a comma, leaving out the natures.
     *
     * @return the joined names, or an empty string when there are no terms
     */
    public String toStringWithOutNature() {
        return toStringWithOutNature(",");
    }

    /**
     * Joins the real names of the terms, leaving out the natures.
     *
     * @param split separator placed between names
     * @return the joined names, or an empty string when there are no terms
     */
    public String toStringWithOutNature(String split) {
        if (terms == null || terms.isEmpty()) {
            return "";
        }

        StringBuilder joined = null;
        for (Term term : terms) {
            if (joined == null) {
                // First term seeds the builder so no leading separator is written.
                joined = new StringBuilder(term.getRealName());
            } else {
                joined.append(split);
                joined.append(term.getRealName());
            }
        }
        return joined.toString();
    }
}

View File

@ -1,320 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.domain;
import org.ansj.util.MathUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
/**
 * A single segmented word, its position and natures, and the links and scores used while the
 * best segmentation path is being built.
 */
public class Term implements Serializable {

    private static final long serialVersionUID = 1L;

    // The current word.
    private String name;

    // Original text of the word; getRealName() falls back to name while this is null.
    private String realName;

    // Start offset of the current word.
    private int offe;

    // List of candidate natures.
    private TermNatures termNatures = TermNatures.NULL;

    // Dictionary item this term was created from.
    private AnsjItem item = AnsjItem.NULL;

    // Next term stored in the same row.
    private Term next;

    // Path score.
    private double score = 0;

    // The term's own score.
    private double selfScore = 1;

    // Predecessor on the path.
    private Term from;

    // Successor on the path.
    private Term to;

    // Nature of this term. Only meaningful after nature recognition; empty by default.
    private Nature nature = Nature.NULL;

    // Whether this is a new word.
    private boolean newWord;

    // Synonyms.
    private List<String> synonyms;

    private List<Term> subTerm = null;

    /**
     * Creates a term from a dictionary item; natures are taken from the item when it has any.
     *
     * @param name the word
     * @param offe start offset of the word
     * @param item dictionary item backing the word
     */
    public Term(String name, int offe, AnsjItem item) {
        super();
        this.name = name;
        this.offe = offe;
        this.item = item;
        if (item.termNatures != null) {
            this.termNatures = item.termNatures;
            if (termNatures.nature != null) {
                this.nature = termNatures.nature;
            }
        }
    }

    /**
     * Creates a term with explicit natures.
     *
     * @param name        the word
     * @param offe        start offset of the word
     * @param termNatures natures of the word; must not be {@code null}
     */
    public Term(String name, int offe, TermNatures termNatures) {
        super();
        this.name = name;
        this.offe = offe;
        this.termNatures = termNatures;
        if (termNatures.nature != null) {
            this.nature = termNatures.nature;
        }
    }

    /**
     * Creates a term with a single nature given by name and frequency.
     *
     * @param name       the word
     * @param offe       start offset of the word
     * @param natureStr  name of the nature
     * @param natureFreq frequency of the nature
     */
    public Term(String name, int offe, String natureStr, int natureFreq) {
        super();
        this.name = name;
        this.offe = offe;
        TermNature termNature = new TermNature(natureStr, natureFreq);
        this.nature = termNature.nature;
        this.termNatures = new TermNatures(termNature);
    }

    /** Returns the offset just past this term: start offset plus the length of the name. */
    public int toValue() {
        return offe + name.length();
    }

    public int getOffe() {
        return offe;
    }

    public void setOffe(int offe) {
        this.offe = offe;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * Offers {@code from} as predecessor, scored with {@code MathUtil.compuScore}.
     *
     * <p>The offer is accepted when there is no predecessor yet, the stored score is zero, or the
     * stored score is not smaller than the new one.
     *
     * @param from        candidate predecessor
     * @param relationMap relation scores handed to the scoring function
     */
    public void setPathScore(Term from, Map<String, Double> relationMap) {
        // Viterbi: build the best path.
        double score = MathUtil.compuScore(from, this, relationMap);
        if (this.from == null || this.score == 0 || this.score >= score) {
            this.setFromAndScore(from, score);
        }
    }

    /**
     * Offers {@code from} as predecessor, scored as this term's own score plus the predecessor's
     * path score. Accepted when there is no predecessor yet or the new score is smaller.
     *
     * @param from candidate predecessor
     */
    public void setPathSelfScore(Term from) {
        double score = this.selfScore + from.score;
        // Viterbi: build the best path.
        if (this.from == null || this.score > score) {
            this.setFromAndScore(from, score);
        }
    }

    private void setFromAndScore(Term from, double score) {
        this.from = from;
        this.score = score;
    }

    /**
     * Merges {@code to} into this term: names are concatenated and this term takes over the
     * successor of {@code to}. Real names are concatenated only when both are non-blank.
     *
     * @param to the term to append
     * @return this term
     */
    public Term merage(Term to) {
        this.name = this.name + to.getName();
        if (StringUtil.isNotBlank(this.realName) && StringUtil.isNotBlank(to.getRealName())) {
            this.realName = this.realName + to.getRealName();
        }
        this.setTo(to.to);
        return this;
    }

    /**
     * Merges {@code to} into this term like {@link #merage(Term)}, but always concatenates the
     * real names, so blank real names are kept.
     *
     * @param to the term to append
     * @return this term
     */
    public Term merageWithBlank(Term to) {
        // Read the effective real name before name changes: getRealName() falls back to name,
        // and concatenating the raw field would produce the text "null" when it was never set.
        String ownRealName = getRealName();
        this.name = this.name + to.getName();
        this.realName = ownRealName + to.getRealName();
        this.setTo(to.to);
        return this;
    }

    /**
     * Shifts the start offset.
     *
     * @param offe amount added to the current offset
     */
    public void updateOffe(int offe) {
        this.offe += offe;
    }

    public Term next() {
        return next;
    }

    /**
     * Sets the next term stored in the same row.
     *
     * @param next the next term
     * @return this term
     */
    public Term setNext(Term next) {
        this.next = next;
        return this;
    }

    public Term from() {
        return from;
    }

    public Term to() {
        return to;
    }

    public void setFrom(Term from) {
        this.from = from;
    }

    public void setTo(Term to) {
        this.to = to;
    }

    /**
     * Returns the candidate natures of this term.
     */
    public TermNatures termNatures() {
        return termNatures;
    }

    public void setNature(Nature nature) {
        this.nature = nature;
    }

    /**
     * Returns the nature of this term.
     */
    public Nature natrue() {
        return nature;
    }

    public String getNatureStr() {
        return nature.natureStr;
    }

    /** Formats the term as {@code realName/nature}, or just the real name when the nature is "null". */
    @Override
    public String toString() {
        if ("null".equals(nature.natureStr)) {
            return this.getRealName();
        }
        return this.getRealName() + "/" + nature.natureStr;
    }

    /**
     * Resets both the path score and the own score to zero.
     */
    public void clearScore() {
        this.score = 0;
        this.selfScore = 0;
    }

    public void setSubTerm(List<Term> subTerm) {
        this.subTerm = subTerm;
    }

    public List<Term> getSubTerm() {
        return subTerm;
    }

    /** Returns the real name, or the name when no real name has been set. */
    public String getRealName() {
        if (realName == null) {
            return name;
        }
        return realName;
    }

    public void setRealName(String realName) {
        this.realName = realName;
    }

    public double score() {
        return this.score;
    }

    public void score(double score) {
        this.score = score;
    }

    public double selfScore() {
        return this.selfScore;
    }

    public void selfScore(double selfScore) {
        this.selfScore = selfScore;
    }

    public AnsjItem item() {
        return this.item;
    }

    public boolean isNewWord() {
        return newWord;
    }

    public void setNewWord(boolean newWord) {
        this.newWord = newWord;
    }

    /** Replaces the candidate natures and takes the nature from them. */
    public void updateTermNaturesAndNature(TermNatures termNatures) {
        this.termNatures = termNatures;
        this.nature = termNatures.nature;
    }

    public List<String> getSynonyms() {
        return synonyms;
    }

    public void setSynonyms(List<String> synonyms) {
        this.synonyms = synonyms;
    }
}

View File

@ -1,80 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.domain;
import org.ansj.library.NatureLibrary;
import java.io.Serializable;
/**
 * Pairs a nature with the frequency at which a word carries it.
 *
 * @author ansj
 */
public class TermNature implements Serializable {

    private static final long serialVersionUID = 5538058744208591381L;

    /* Predefined instances, each created with a frequency of one. */
    public static final TermNature M = new TermNature("m", 1);

    public static final TermNature EN = new TermNature("en", 1);

    public static final TermNature BEGIN = new TermNature("始##始", 1);

    public static final TermNature END = new TermNature("末##末", 1);

    public static final TermNature USER_DEFINE = new TermNature("userDefine", 1);

    public static final TermNature NR = new TermNature("nr", 1);

    public static final TermNature NT = new TermNature("nt", 1);

    public static final TermNature NS = new TermNature("ns", 1);

    public static final TermNature NW = new TermNature("nw", 1);

    public static final TermNature NRF = new TermNature("nrf", 1);

    public static final TermNature NULL = new TermNature("null", 1);

    public Nature nature;

    public int frequency;

    /**
     * @param natureStr name of the nature, resolved through {@code NatureLibrary.getNature}
     * @param frequency frequency of the nature
     */
    public TermNature(String natureStr, int frequency) {
        this.nature = NatureLibrary.getNature(natureStr);
        this.frequency = frequency;
    }

    /**
     * Parses text such as {@code {n=10, v=5}} into one {@code TermNature} per entry.
     *
     * <p>The first and last characters are dropped without being inspected, the remainder is
     * split on commas, and each entry is split on {@code =} into a nature name (trimmed) and a
     * frequency (parsed as written, without trimming).
     *
     * @param natureStr the bracketed nature list
     * @return the parsed natures, in input order
     */
    public static TermNature[] setNatureStrToArray(String natureStr) {
        String body = natureStr.substring(1, natureStr.length() - 1);
        String[] entries = body.split(",");
        TermNature[] parsed = new TermNature[entries.length];

        int position = 0;
        for (String entry : entries) {
            String[] nameAndFreq = entry.split("=");
            int freq = Integer.parseInt(nameAndFreq[1]);
            parsed[position++] = new TermNature(nameAndFreq[0].trim(), freq);
        }
        return parsed;
    }

    /** Formats the pair as {@code natureName/frequency}. */
    @Override
    public String toString() {
        return nature.natureStr + "/" + frequency;
    }
}

View File

@ -1,160 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.domain;
import java.io.Serializable;
/**
 * The candidate natures of a term plus values aggregated from them.
 *
 * <p>All fields are public and mutable; instances are not immutable even when
 * exposed through the {@code static final} constants below.
 *
 * @author ansj
 */
public class TermNatures implements Serializable {

    private static final long serialVersionUID = 1L;

    public static final TermNatures M = new TermNatures(TermNature.M);
    public static final TermNatures NR = new TermNatures(TermNature.NR);
    public static final TermNatures EN = new TermNatures(TermNature.EN);
    // The three-argument constructor also overwrites the frequency of the shared TermNature.
    public static final TermNatures END = new TermNatures(TermNature.END, 50610, -1);
    public static final TermNatures BEGIN = new TermNatures(TermNature.BEGIN, 50610, 0);
    public static final TermNatures NT = new TermNatures(TermNature.NT);
    public static final TermNatures NS = new TermNatures(TermNature.NS);
    public static final TermNatures NRF = new TermNatures(TermNature.NRF);
    public static final TermNatures NW = new TermNatures(TermNature.NW);
    public static final TermNatures NULL = new TermNatures(TermNature.NULL);

    /** Candidate natures of the term. */
    public TermNature[] termNatures = null;

    /** Numeral attributes; stays {@link NumNatureAttr#NULL} unless a candidate has nature index 18 or 29. */
    public NumNatureAttr numAttr = NumNatureAttr.NULL;

    /** Person-name attributes; {@link PersonNatureAttr#NULL} until set explicitly. */
    public PersonNatureAttr personAttr = PersonNatureAttr.NULL;

    /** The selected nature (the most frequent candidate where the constructor computes it). */
    public Nature nature = null;

    /** Sum of candidate frequencies, or the value passed to the three-argument constructor. */
    public int allFreq = 0;

    /** Identifier; {@code -2} when the constructor does not supply one. */
    public int id = -2;

    /**
     * Builds the set from several candidates and selects the most frequent one as
     * {@link #nature}. On a tie the earliest candidate wins. For an empty array
     * {@link #nature} stays {@code null}.
     *
     * @param termNatures candidate natures
     * @param id identifier stored in {@link #id}
     */
    public TermNatures(TermNature[] termNatures, int id) {
        this.id = id;
        this.termNatures = termNatures;

        TermNature best = null;
        int bestFreq = -1;
        for (TermNature candidate : termNatures) {
            if (candidate.frequency > bestFreq) {
                bestFreq = candidate.frequency;
                best = candidate;
            }
        }
        if (best != null) {
            this.nature = best.nature;
        }
        serAttribute();
    }

    /**
     * Builds the set from a single candidate, which also becomes {@link #nature}.
     *
     * @param termNature the only candidate
     */
    public TermNatures(TermNature termNature) {
        this.termNatures = new TermNature[] {termNature};
        this.nature = termNature.nature;
        serAttribute();
    }

    /**
     * Builds the set from a single candidate with an explicit total frequency and id.
     *
     * <p>Unlike the other constructors this one writes {@code allFreq} into the passed
     * {@code termNature}'s {@code frequency}, does not set {@link #nature}, and does
     * not derive {@link #numAttr}.
     *
     * @param termNature the only candidate; its frequency is overwritten
     * @param allFreq value stored both in the candidate and in {@link #allFreq}
     * @param id identifier stored in {@link #id}
     */
    public TermNatures(TermNature termNature, int allFreq, int id) {
        termNature.frequency = allFreq;
        this.termNatures = new TermNature[] {termNature};
        this.allFreq = allFreq;
        this.id = id;
    }

    /**
     * Accumulates {@link #allFreq} over all candidates and derives {@link #numAttr}
     * from the candidates whose nature index is 18 or 29.
     */
    private void serAttribute() {
        int highest = 0;
        NumNatureAttr numeral = null;
        for (TermNature current : termNatures) {
            allFreq += current.frequency;
            highest = Math.max(highest, current.frequency);
            // NOTE(review): 18 and 29 are indices into the nature map; presumably the numeral
            // nature and a numeral-suffix nature - confirm against the nature map resource.
            switch (current.nature.index) {
                case 18:
                    numeral = (numeral == null) ? new NumNatureAttr() : numeral;
                    numeral.numFreq = current.frequency;
                    break;
                case 29:
                    numeral = (numeral == null) ? new NumNatureAttr() : numeral;
                    numeral.numEndFreq = current.frequency;
                    break;
                default:
                    break;
            }
        }
        if (numeral == null) {
            return;
        }
        // The flag is raised when the index-18 frequency equals the highest frequency seen.
        if (highest == numeral.numFreq) {
            numeral.flag = true;
        }
        this.numAttr = numeral;
    }

    /**
     * Replaces the person-name attributes.
     *
     * @param personAttr the value assigned to {@link #personAttr}
     */
    public void setPersonNatureAttr(PersonNatureAttr personAttr) {
        this.personAttr = personAttr;
    }
}

View File

@ -1,35 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.exception;
/**
 * Unchecked exception type for library failures.
 */
public class LibraryException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception carrying only a message.
     *
     * @param message the detail message
     */
    public LibraryException(String message) {
        super(message);
    }

    /**
     * Wraps another exception as the cause.
     *
     * @param e the underlying exception, retrievable through {@link #getCause()}
     */
    public LibraryException(Exception e) {
        super(e);
    }
}

View File

@ -1,233 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.library;
import org.ansj.dic.PathToStream;
import org.ansj.domain.KV;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.Value;
import org.nlpcn.commons.lang.tire.library.Library;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import java.io.BufferedReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
/**
 * Registry of ambiguity dictionaries keyed by configuration name.
 *
 * <p>Each entry pairs the path a dictionary is loaded from with its {@link Forest}.
 * The forest is loaded on first access. Every key in {@code MyStaticValue.ENV}
 * starting with {@link #DEFAULT} is registered at class initialization, and a
 * fallback path is registered for {@link #DEFAULT} itself.
 *
 * <p>The backing map is a plain {@code HashMap}; only {@code init} is synchronized.
 */
public class AmbiguityLibrary {
    private static final Log LOG = MyStaticValue.getLog(AmbiguityLibrary.class);

    // Ambiguity dictionaries: key -> (dictionary path, lazily loaded forest).
    private static final Map<String, KV<String, Forest>> AMBIGUITY = new HashMap<>();

    public static final String DEFAULT = "ambiguity";

    static {
        for (Entry<String, String> entry : MyStaticValue.ENV.entrySet()) {
            if (entry.getKey().startsWith(DEFAULT)) {
                put(entry.getKey(), entry.getValue());
            }
        }
        putIfAbsent(DEFAULT, "library/ambiguity.dic");
    }

    /**
     * Returns the default ambiguity dictionary.
     *
     * @return the forest registered under {@link #DEFAULT}, or {@code null} when the
     *         default entry is not registered or cannot be loaded
     */
    public static Forest get() {
        if (!AMBIGUITY.containsKey(DEFAULT)) {
            return null;
        }
        return get(DEFAULT);
    }

    /**
     * Returns the ambiguity dictionary registered under {@code key}, loading it on
     * first access. A key that is only present in {@code MyStaticValue.ENV} is
     * registered on the fly.
     *
     * @param key the dictionary key
     * @return the forest, or {@code null} when the key is unknown or loading failed
     */
    public static Forest get(String key) {
        KV<String, Forest> kv = AMBIGUITY.get(key);
        if (kv == null) {
            if (MyStaticValue.ENV.containsKey(key)) {
                putIfAbsent(key, MyStaticValue.ENV.get(key));
                return get(key);
            }
            // Previously labelled "crf" (copied from CrfLibrary), which pointed at the wrong library.
            LOG.warn("ambiguity " + key + " not found in config ");
            return null;
        }
        Forest forest = kv.getV();
        if (forest == null) {
            try {
                forest = init(key, kv, false);
            } catch (Exception e) {
                // Stay best-effort (return null) but no longer hide the failure.
                LOG.warn("ambiguity " + key + " could not be initialised", e);
            }
        }
        return forest;
    }

    /**
     * Loads, or reloads, the forest of one entry from the entry's path.
     *
     * <p>Each non-blank line is trimmed and split on tabs. Lines with an odd number
     * of fields are reported and skipped. On any failure the entry is removed from
     * the registry.
     *
     * @param key registry key of the entry, used to drop it on failure
     * @param kv the entry (path and current forest)
     * @param reload when {@code true} an existing forest is cleared and refilled;
     *        when {@code false} an existing forest is returned untouched
     * @return the loaded forest, or {@code null} when loading failed
     */
    private static synchronized Forest init(String key, KV<String, Forest> kv, boolean reload) {
        Forest forest = kv.getV();
        if (forest != null) {
            if (reload) {
                forest.clear();
            } else {
                return forest;
            }
        } else {
            forest = new Forest();
        }
        try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "utf-8")) {
            String temp;
            LOG.debug("begin init ambiguity");
            long start = System.currentTimeMillis();
            while ((temp = br.readLine()) != null) {
                if (StringUtil.isNotBlank(temp)) {
                    temp = StringUtil.trim(temp);
                    String[] split = temp.split("\t");
                    if (split.length % 2 != 0) {
                        LOG.error("init ambiguity error in line :" + temp + " format err !");
                        continue;
                    }
                    forest.addBranch(branchKey(split), split);
                }
            }
            LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
            kv.setV(forest);
            return forest;
        } catch (Exception e) {
            LOG.error("Init ambiguity library error :" + e.getMessage() + ", path: " + kv.getK());
            AMBIGUITY.remove(key);
            return null;
        }
    }

    /**
     * Concatenates the elements at even positions (0, 2, 4, ...) of {@code split};
     * the result is the branch key under which the whole array is stored.
     */
    private static String branchKey(String[] split) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < split.length; i += 2) {
            sb.append(split[i]);
        }
        return sb.toString();
    }

    /**
     * Adds one ambiguity rule to the dictionary registered under {@code key}.
     *
     * <p>The call is a logged no-op when {@code split} has an odd number of elements
     * or when no dictionary is available for {@code key}.
     *
     * @param key the dictionary key
     * @param split the rule fields; the elements at even positions are concatenated
     *        to form the branch key and the full array is stored as its parameters
     */
    public static void insert(String key, String... split) {
        Forest forest = get(key);
        if (split.length % 2 != 0) {
            LOG.error("init ambiguity error in line :" + Arrays.toString(split) + " format err !");
            return;
        }
        if (forest == null) {
            // get(key) returns null for unknown or unloadable dictionaries; avoid the NPE.
            LOG.error("ambiguity " + key + " is not available, rule " + Arrays.toString(split) + " not inserted");
            return;
        }
        forest.addBranch(branchKey(split), split);
    }

    /**
     * Inserts a prepared value into the dictionary registered under {@code key}.
     * The call is a logged no-op when no dictionary is available for {@code key}.
     *
     * @param key the dictionary key
     * @param value the value handed to {@link Library#insertWord}
     */
    public static void insert(String key, Value value) {
        Forest forest = get(key);
        if (forest == null) {
            LOG.error("ambiguity " + key + " is not available, value not inserted");
            return;
        }
        Library.insertWord(forest, value);
    }

    /**
     * Registers a dictionary path under {@code key} without a loaded forest,
     * replacing any existing entry.
     *
     * @param key the dictionary key
     * @param path the path the dictionary is loaded from on first access
     */
    public static void put(String key, String path) {
        put(key, path, null);
    }

    /**
     * Registers a dictionary under {@code key}, replacing any existing entry, and
     * records the path in {@code MyStaticValue.ENV}.
     *
     * @param key the dictionary key
     * @param path the path associated with the dictionary
     * @param value an already built forest, or {@code null} to load lazily from {@code path}
     */
    public static void put(String key, String path, Forest value) {
        AMBIGUITY.put(key, KV.with(path, value));
        MyStaticValue.ENV.put(key, path);
    }

    /**
     * Removes the entry for {@code key} from the registry and from
     * {@code MyStaticValue.ENV}; a loaded forest is cleared first.
     *
     * @param key the dictionary key
     * @return the removed entry, or {@code null} when there was none
     */
    public static KV<String, Forest> remove(String key) {
        KV<String, Forest> kv = AMBIGUITY.get(key);
        if (kv != null && kv.getV() != null) {
            kv.getV().clear();
        }
        MyStaticValue.ENV.remove(key);
        return AMBIGUITY.remove(key);
    }

    /**
     * Reloads the dictionary registered under {@code key} from its path.
     *
     * <p>When {@code MyStaticValue.ENV} has no mapping for {@code key} the entry is
     * removed and nothing is loaded.
     *
     * @param key the dictionary key
     */
    public static void reload(String key) {
        if (!MyStaticValue.ENV.containsKey(key)) {
            // Not configured any more: drop the entry. Previously execution fell through,
            // registered a null path and attempted a load that could only fail.
            remove(key);
            return;
        }
        putIfAbsent(key, MyStaticValue.ENV.get(key));
        KV<String, Forest> kv = AMBIGUITY.get(key);
        if (kv != null) {
            init(key, kv, true);
        }
    }

    /**
     * Returns the registered keys as a live view of the backing map's key set.
     */
    public static Set<String> keys() {
        return AMBIGUITY.keySet();
    }

    /**
     * Registers {@code path} under {@code key} unless the key is already registered.
     * Unlike {@link #put(String, String)} this does not touch {@code MyStaticValue.ENV}.
     *
     * @param key the dictionary key
     * @param path the path the dictionary is loaded from on first access
     */
    public static void putIfAbsent(String key, String path) {
        if (!AMBIGUITY.containsKey(key)) {
            AMBIGUITY.put(key, KV.with(path, (Forest) null));
        }
    }
}

View File

@ -1,163 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.library;
import org.ansj.app.crf.Model;
import org.ansj.app.crf.SplitWord;
import org.ansj.app.crf.model.CRFModel;
import org.ansj.dic.PathToStream;
import org.ansj.domain.KV;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.logging.Log;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
/**
 * Registry of CRF models keyed by configuration name.
 *
 * <p>Each entry pairs the path a model is loaded from with its {@link SplitWord}.
 * The model is loaded on first access. Every key in {@code MyStaticValue.ENV}
 * starting with {@link #DEFAULT} is registered at class initialization.
 */
public class CrfLibrary {
    private static final Log LOG = MyStaticValue.getLog(CrfLibrary.class);

    // CRF models: key -> (model path, lazily loaded splitter).
    private static final Map<String, KV<String, SplitWord>> CRF = new HashMap<>();

    public static final String DEFAULT = "crf";

    static {
        for (Entry<String, String> configured : MyStaticValue.ENV.entrySet()) {
            String name = configured.getKey();
            if (name.startsWith(DEFAULT)) {
                put(name, configured.getValue());
            }
        }
        putIfAbsent(DEFAULT, "jar://crf.model");
    }

    /**
     * Returns the splitter registered under {@link #DEFAULT}.
     */
    public static SplitWord get() {
        return get(DEFAULT);
    }

    /**
     * Returns the splitter registered under {@code key}, loading its model on first
     * access. A key that is only present in {@code MyStaticValue.ENV} is registered
     * on the fly.
     *
     * @param key the model key
     * @return the splitter, or {@code null} when the key is unknown or loading failed
     */
    public static SplitWord get(String key) {
        KV<String, SplitWord> entry = CRF.get(key);
        if (entry != null) {
            SplitWord loaded = entry.getV();
            return loaded != null ? loaded : initCRFModel(entry);
        }
        if (!MyStaticValue.ENV.containsKey(key)) {
            LOG.warn("crf " + key + " not found in config ");
            return null;
        }
        putIfAbsent(key, MyStaticValue.ENV.get(key));
        return get(key);
    }

    /**
     * Loads the CRF model of one entry from the entry's path, unless the entry
     * already holds a splitter.
     *
     * @param kv the entry (path and current splitter)
     * @return the splitter, or {@code null} when loading failed (the error is logged)
     */
    private static synchronized SplitWord initCRFModel(KV<String, SplitWord> kv) {
        try {
            SplitWord existing = kv.getV();
            if (existing != null) {
                return existing;
            }
            long begin = System.currentTimeMillis();
            LOG.debug("begin init crf model!");
            try (InputStream modelStream = PathToStream.stream(kv.getK())) {
                SplitWord created = new SplitWord(Model.load(CRFModel.class, modelStream));
                kv.setV(created);
                LOG.info("load crf use time:" + (System.currentTimeMillis() - begin) + " path is : " + kv.getK());
                return created;
            }
        } catch (Exception e) {
            LOG.error(kv + " load err " + e.getMessage());
            return null;
        }
    }

    /**
     * Registers a model path under {@code key} without a loaded splitter,
     * replacing any existing entry.
     *
     * @param key the model key
     * @param path the path the model is loaded from on first access
     */
    public static void put(String key, String path) {
        put(key, path, null);
    }

    /**
     * Registers a model under {@code key}, replacing any existing entry, and records
     * the path in {@code MyStaticValue.ENV}.
     *
     * @param key the model key
     * @param path the path associated with the model
     * @param sw an already built splitter, or {@code null} to load lazily from {@code path}
     */
    public static void put(String key, String path, SplitWord sw) {
        CRF.put(key, KV.with(path, sw));
        MyStaticValue.ENV.put(key, path);
    }

    /**
     * Removes the entry for {@code key} from {@code MyStaticValue.ENV} and from the registry.
     *
     * @param key the model key
     * @return the removed entry, or {@code null} when there was none
     */
    public static KV<String, SplitWord> remove(String key) {
        MyStaticValue.ENV.remove(key);
        return CRF.remove(key);
    }

    /**
     * Discards the loaded splitter of {@code key} so that the next {@link #get(String)}
     * builds a new one. A warning is logged in every case, because splitters obtained
     * earlier are not updated.
     *
     * @param key the model key
     */
    public static void reload(String key) {
        KV<String, SplitWord> entry = CRF.get(key);
        if (entry != null) {
            entry.setV(null);
        }
        LOG.warn("make sure ,this reload not use same obj , it to instance a new model");
    }

    /**
     * Returns the registered keys as a live view of the backing map's key set.
     */
    public static Set<String> keys() {
        return CRF.keySet();
    }

    /**
     * Registers {@code path} under {@code key} unless the key is already registered.
     * Unlike {@link #put(String, String)} this does not touch {@code MyStaticValue.ENV}.
     *
     * @param key the model key
     * @param path the path the model is loaded from on first access
     */
    public static void putIfAbsent(String key, String path) {
        if (CRF.containsKey(key)) {
            return;
        }
        CRF.put(key, KV.with(path, (SplitWord) null));
    }
}

View File

@ -1,167 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.library;
import org.ansj.dic.DicReader;
import org.ansj.domain.AnsjItem;
import org.ansj.domain.PersonNatureAttr;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.name.PersonAttrLibrary;
import org.nlpcn.commons.lang.dat.DoubleArrayTire;
import org.nlpcn.commons.lang.dat.Item;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;
/**
 * Static access to the core dictionary, held as a double-array trie loaded from
 * {@code core.dic} at class initialization.
 */
public class DATDictionary {
    private static final Log LOG = LogFactory.getLog(DATDictionary.class);

    /** The core dictionary; {@code null} if {@link #loadDAT()} failed. */
    private static final DoubleArrayTire DAT = loadDAT();

    /**
     * Array length of the core dictionary.
     * NOTE(review): dereferences DAT directly, so a failed load surfaces as an error
     * during class initialization rather than a null dictionary - confirm this is intended.
     */
    public static int arrayLength = DAT.arrayLength;

    /**
     * Loads the core dictionary, merges the person-name attributes into it and clears
     * the name of every item whose status is below 2.
     *
     * @return the dictionary, or {@code null} when one of the caught exceptions occurred
     */
    private static DoubleArrayTire loadDAT() {
        long begin = System.currentTimeMillis();
        try {
            DoubleArrayTire loaded = DoubleArrayTire.loadText(DicReader.getInputStream("core.dic"), AnsjItem.class);
            // Required for person-name recognition.
            personNameFull(loaded);
            // Keep names only for items with status >= 2; clear the rest.
            for (Item entry : loaded.getDAT()) {
                if (entry != null && entry.getName() != null && entry.getStatus() < 2) {
                    entry.setName(null);
                }
            }
            LOG.info("init core library ok use time : " + (System.currentTimeMillis() - begin));
            return loaded;
        } catch (InstantiationException e) {
            LOG.warn("无法实例化", e);
        } catch (IllegalAccessException e) {
            LOG.warn("非法访问", e);
        } catch (NumberFormatException e) {
            LOG.warn("数字格式异常", e);
        } catch (IOException e) {
            LOG.warn("IO异常", e);
        }
        return null;
    }

    /**
     * Attaches the person-name attributes from {@link PersonAttrLibrary} to the
     * matching dictionary items.
     *
     * <p>A single-character key whose slot in the array is empty gets a new item
     * (base 0, check -1, status 3). Every other key is looked up with
     * {@code getItem}; keys without an item are skipped. Items lacking term natures
     * receive {@link TermNatures#NULL} for single characters below code point 256 and
     * a fresh {@code NR} set otherwise.
     *
     * @param dat the dictionary to update in place
     */
    private static void personNameFull(DoubleArrayTire dat) throws NumberFormatException, IOException {
        HashMap<String, PersonNatureAttr> persons = new PersonAttrLibrary().getPersonMap();
        char base = 0;
        for (Entry<String, PersonNatureAttr> person : persons.entrySet()) {
            String word = person.getKey();
            boolean singleChar = word.length() == 1;
            AnsjItem target;
            if (singleChar && (AnsjItem) dat.getDAT()[word.charAt(0)] == null) {
                target = new AnsjItem();
                target.setBase(base);
                target.setCheck(-1);
                target.setStatus((byte) 3);
                target.setName(word);
                dat.getDAT()[word.charAt(0)] = target;
            } else {
                target = dat.getItem(word);
            }
            if (target == null) {
                continue;
            }
            if (target.termNatures == null) {
                target.termNatures = (singleChar && word.charAt(0) < 256)
                        ? TermNatures.NULL
                        : new TermNatures(TermNature.NR);
            }
            target.termNatures.setPersonNatureAttr(person.getValue());
        }
    }

    /**
     * Returns the status of the item stored at the array position of {@code c}.
     *
     * @param c the character used as array index
     * @return the item's status, or {@code 0} when the slot is empty
     */
    public static int status(char c) {
        Item found = DAT.getDAT()[c];
        return found == null ? 0 : found.getStatus();
    }

    /**
     * Tells whether {@code word} has an item with status above 1 in the core dictionary.
     *
     * @param word the word to look up
     * @return {@code true} when such an item exists
     */
    public static boolean isInSystemDic(String word) {
        Item found = DAT.getItem(word);
        if (found == null) {
            return false;
        }
        return found.getStatus() > 1;
    }

    /**
     * Returns the item at {@code index}, or {@link AnsjItem#NULL} when there is none.
     */
    public static AnsjItem getItem(int index) {
        AnsjItem found = DAT.getItem(index);
        return found == null ? AnsjItem.NULL : found;
    }

    /**
     * Returns the item for {@code str}, or {@link AnsjItem#NULL} when there is none
     * or its status is below 2.
     */
    public static AnsjItem getItem(String str) {
        AnsjItem found = DAT.getItem(str);
        if (found != null && found.getStatus() >= 2) {
            return found;
        }
        return AnsjItem.NULL;
    }

    /**
     * Returns the id the dictionary reports for {@code str}.
     */
    public static int getId(String str) {
        return DAT.getId(str);
    }
}

View File

@ -1,309 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.library;
import org.ansj.dic.PathToStream;
import org.ansj.domain.KV;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.Value;
import org.nlpcn.commons.lang.tire.library.Library;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.BufferedReader;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
public class DicLibrary {
private static final Log LOG = LogFactory.getLog();
public static final String DEFAULT = "dic";
public static final String DEFAULT_NATURE = "userDefine";
public static final Integer DEFAULT_FREQ = 1000;
public static final String DEFAULT_FREQ_STR = "1000";
// 用户自定义词典
private static final Map<String, KV<String, Forest>> DIC = new HashMap<>();
static {
for (Entry<String, String> entry : MyStaticValue.ENV.entrySet()) {
if (entry.getKey().startsWith(DEFAULT)) {
put(entry.getKey(), entry.getValue());
}
}
putIfAbsent(DEFAULT, "library/default.dic");
Forest forest = get();
if (forest == null) {
put(DEFAULT, DEFAULT, new Forest());
}
}
/**
*
*
* @param keyword
* @param nature
* @param freq
*/
public static void insert(String key, String keyword, String nature, int freq) {
Forest dic = get(key);
String[] paramers = new String[2];
paramers[0] = nature;
paramers[1] = String.valueOf(freq);
Value value = new Value(keyword, paramers);
Library.insertWord(dic, value);
}
/**
*
*
* @param keyword
*/
public static void insert(String key, String keyword) {
insert(key, keyword, DEFAULT_NATURE, DEFAULT_FREQ);
}
/**
*
*/
public static void delete(String key, String word) {
Forest dic = get(key);
if (dic != null) {
Library.removeWord(dic, word);
}
}
/**
*
*/
public static void clear(String key) {
get(key).clear();
}
public static Forest get() {
if (!DIC.containsKey(DEFAULT)) {
return null;
}
return get(DEFAULT);
}
/**
* crf
*
* @param modelName
* @return
*/
public static Forest get(String key) {
KV<String, Forest> kv = DIC.get(key);
if (kv == null) {
if (MyStaticValue.ENV.containsKey(key)) {
putIfAbsent(key, MyStaticValue.ENV.get(key));
return get(key);
}
LOG.warn("dic " + key + " not found in config ");
return null;
}
Forest forest = kv.getV();
if (forest == null) {
forest = init(key, kv, false);
}
return forest;
}
/**
* keys
*
* @param keys
* @return
*/
public static Forest[] gets(String... keys) {
Forest[] forests = new Forest[keys.length];
for (int i = 0; i < forests.length; i++) {
forests[i] = get(keys[i]);
}
return forests;
}
/**
* keys
*
* @param keys
* @return
*/
public static Forest[] gets(Collection<String> keys) {
return gets(keys.toArray(new String[keys.size()]));
}
/**
*
*
* @param key
* @param path
* @return
*/
private synchronized static Forest init(String key, KV<String, Forest> kv, boolean reload) {
Forest forest = kv.getV();
if (forest != null) {
if (reload) {
forest.clear();
} else {
return forest;
}
} else {
forest = new Forest();
}
try {
LOG.debug("begin init dic !");
long start = System.currentTimeMillis();
String temp = null;
String[] strs = null;
Value value = null;
try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "UTF-8")) {
while ((temp = br.readLine()) != null) {
if (StringUtil.isNotBlank(temp)) {
temp = StringUtil.trim(temp);
strs = temp.split("\t");
strs[0] = strs[0].toLowerCase();
// 如何核心辞典存在那么就放弃
if (MyStaticValue.isSkipUserDefine && DATDictionary.getId(strs[0]) > 0) {
continue;
}
if (strs.length != 3) {
value = new Value(strs[0], DEFAULT_NATURE, DEFAULT_FREQ_STR);
} else {
value = new Value(strs[0], strs[1], strs[2]);
}
Library.insertWord(forest, value);
}
}
}
LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
kv.setV(forest);
return forest;
} catch (Exception e) {
LOG.error("Init dic library error :" + e.getMessage() + ", path: " + kv.getK());
DIC.remove(key);
return null;
}
}
/**
*
*
* @param dicDefault
* @param dicDefault2
* @param dic2
*/
public static void put(String key, String path, Forest forest) {
DIC.put(key, KV.with(path, forest));
MyStaticValue.ENV.put(key, path);
}
/**
*
*
* @param dicDefault
* @param dicDefault2
* @param dic2
*/
public static void putIfAbsent(String key, String path) {
if (!DIC.containsKey(key)) {
DIC.put(key, KV.with(path, (Forest) null));
}
}
/**
*
*
* @param dicDefault
* @param dicDefault2
* @param dic2
*/
public static void put(String key, String path) {
put(key, path, null);
}
/**
*
*
* @param <T>
* @param <T>
*
* @param dicDefault
* @param dicDefault2
* @param dic2
*/
public static synchronized Forest putIfAbsent(String key, String path, Forest forest) {
KV<String, Forest> kv = DIC.get(key);
if (kv != null && kv.getV() != null) {
return kv.getV();
}
put(key, path, forest);
return forest;
}
public static KV<String, Forest> remove(String key) {
KV<String, Forest> kv = DIC.get(key);
if (kv != null && kv.getV() != null) {
kv.getV().clear();
}
MyStaticValue.ENV.remove(key);
return DIC.remove(key);
}
public static Set<String> keys() {
return DIC.keySet();
}
public static void reload(String key) {
if (!MyStaticValue.ENV.containsKey(key)) { //如果变量中不存在直接删掉这个key不解释了
remove(key);
}
putIfAbsent(key, MyStaticValue.ENV.get(key));
KV<String, Forest> kv = DIC.get(key);
init(key, kv, true);
}
}

View File

@ -1,144 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.library;
import org.ansj.domain.Nature;
import org.ansj.domain.Term;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
/**
 * Lookup tables for natures: a map from nature string to {@link Nature} and a
 * square table of integers indexed by the {@code index} of two natures.
 * Both are filled once at class initialization.
 */
public class NatureLibrary {
    private static final Log logger = LogFactory.getLog(NatureLibrary.class);

    private static final int YI = 1;
    private static final int FYI = -1;

    /** Nature string -> nature. Also receives natures created on demand by {@link #getNature(String)}. */
    private static final HashMap<String, Nature> NATUREMAP = new HashMap<>();

    /** Transition table indexed as {@code [from.index][to.index]}; assigned while loading the table resource. */
    private static int[][] NATURETABLE = null;

    static {
        init();
    }

    /**
     * Reads the nature map and then the nature table.
     *
     * <p>Map lines must have exactly four tab-separated fields; others are skipped.
     * The table is sized from the largest second field seen in the map.
     * An {@link IOException} in either step is logged and otherwise ignored.
     */
    private static void init() {
        final String separator = "\t";
        int largest = 0;
        String line;

        // Load the nature map.
        try (BufferedReader mapReader = MyStaticValue.getNatureMapReader()) {
            while ((line = mapReader.readLine()) != null) {
                String[] fields = line.split(separator);
                if (fields.length != 4) {
                    continue;
                }
                int first = Integer.parseInt(fields[0]);
                int second = Integer.parseInt(fields[1]);
                int fourth = Integer.parseInt(fields[3]);
                NATUREMAP.put(fields[2], new Nature(fields[2], first, second, fourth));
                largest = Math.max(largest, second);
            }
        } catch (IOException e) {
            logger.warn("词性列表加载失败!", e);
        }

        // Load the nature relation table; blank lines do not advance the row counter.
        try (BufferedReader tableReader = MyStaticValue.getNatureTableReader()) {
            NATURETABLE = new int[largest + 1][largest + 1];
            int row = 0;
            while ((line = tableReader.readLine()) != null) {
                if (StringUtil.isBlank(line)) {
                    continue;
                }
                String[] cells = line.split(separator);
                for (int col = 0; col < cells.length; col++) {
                    NATURETABLE[row][col] = Integer.parseInt(cells[col]);
                }
                row++;
            }
        } catch (IOException e) {
            logger.warn("加载词性关系失败!", e);
        }
    }

    /**
     * Returns the table value for a pair of natures.
     *
     * @param from the first nature (row)
     * @param to the second nature (column)
     * @return the table entry, or {@code 0} when either nature has a negative index
     */
    public static int getTwoNatureFreq(Nature from, Nature to) {
        boolean bothIndexed = from.index >= 0 && to.index >= 0;
        return bothIndexed ? NATURETABLE[from.index][to.index] : 0;
    }

    /**
     * Returns the table value for the natures of two terms.
     *
     * @param fromTerm the term supplying the first nature
     * @param toTerm the term supplying the second nature
     * @return the table entry, or {@code 0} when either nature has a negative index
     */
    public static int getTwoTermFreq(Term fromTerm, Term toTerm) {
        return getTwoNatureFreq(fromTerm.natrue(), toTerm.natrue());
    }

    /**
     * Returns the nature for a nature string, creating and caching one when the
     * string is not yet known.
     *
     * @param natureStr the nature string
     * @return the cached nature, or a newly created one built with {@code -1, -1, 1}
     */
    public static Nature getNature(String natureStr) {
        Nature known = NATUREMAP.get(natureStr);
        if (known == null) {
            known = new Nature(natureStr, FYI, FYI, YI);
            NATUREMAP.put(natureStr, known);
        }
        return known;
    }
}

View File

@ -1,59 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.library;
import org.ansj.domain.Term;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
 * Access to bigram frequencies stored on dictionary items. The bigram tables are
 * initialized once, when this class is loaded.
 *
 * @author ansj
 */
public class NgramLibrary {

    static {
        long begin = System.currentTimeMillis();
        MyStaticValue.initBigramTables();
        long elapsed = System.currentTimeMillis() - begin;
        LogFactory.getLog(NgramLibrary.class).info("init ngram ok use time :" + elapsed);
    }

    /**
     * Returns the bigram frequency recorded on {@code from} for {@code to}.
     *
     * @param from the first term; its item's {@code bigramEntryMap} is consulted
     * @param to the second term; its item's index is the lookup key
     * @return the recorded frequency, or {@code 0} when {@code from} has no bigram map
     *         or the map has no entry for {@code to}
     */
    public static int getTwoWordFreq(Term from, Term to) {
        if (from.item().bigramEntryMap == null) {
            return 0;
        }
        Integer recorded = from.item().bigramEntryMap.get(to.item().getIndex());
        return recorded == null ? 0 : recorded;
    }
}

View File

@ -1,271 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.library;
import org.ansj.dic.PathToStream;
import org.ansj.domain.KV;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.BufferedReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
/**
 * Static registry of named stop filters.
 *
 * <p>Each key maps to a {@link KV} holding the dictionary path and the
 * {@link StopRecognition} built from it. The recognizer is created on the
 * first {@link #get(String)} for that key. At class-load time every entry of
 * {@code MyStaticValue.ENV} whose key starts with {@link #DEFAULT} is
 * registered, and {@link #DEFAULT} itself falls back to
 * {@code library/stop.dic}.
 */
public class StopLibrary {

    private static final Log LOG = LogFactory.getLog();

    /** Key of the default stop library, and the prefix of keys picked up from the environment. */
    public static final String DEFAULT = "stop";

    // Registered stop dictionaries: key -> (dictionary path, recognizer or null until loaded).
    private static final Map<String, KV<String, StopRecognition>> STOP = new HashMap<>();

    static {
        for (Entry<String, String> entry : MyStaticValue.ENV.entrySet()) {
            if (entry.getKey().startsWith(DEFAULT)) {
                put(entry.getKey(), entry.getValue());
            }
        }
        putIfAbsent(DEFAULT, "library/stop.dic");
    }

    /**
     * Adds stop natures to the recognizer registered under {@code key}.
     *
     * @param key library key
     * @param filterNatures natures to filter out
     * @throws NullPointerException if no recognizer can be obtained for {@code key}
     */
    public static void insertStopNatures(String key, String... filterNatures) {
        StopRecognition fr = get(key);
        fr.insertStopNatures(filterNatures);
    }

    /**
     * Adds stop regular expressions to the recognizer registered under {@code key}.
     *
     * @param key library key
     * @param regexes regular expressions to filter out
     * @throws NullPointerException if no recognizer can be obtained for {@code key}
     */
    public static void insertStopRegexes(String key, String... regexes) {
        StopRecognition fr = get(key);
        fr.insertStopRegexes(regexes);
    }

    /**
     * Adds stop words to the recognizer registered under {@code key}.
     *
     * @param key library key
     * @param stopWords words to filter out
     * @throws NullPointerException if no recognizer can be obtained for {@code key}
     */
    public static void insertStopWords(String key, String... stopWords) {
        StopRecognition fr = get(key);
        fr.insertStopWords(stopWords);
    }

    /**
     * Adds stop words to the recognizer registered under {@code key}.
     *
     * @param key library key
     * @param stopWords words to filter out
     * @throws NullPointerException if no recognizer can be obtained for {@code key}
     */
    public static void insertStopWords(String key, List<String> stopWords) {
        StopRecognition fr = get(key);
        fr.insertStopWords(stopWords);
    }

    /**
     * Returns the recognizer registered under {@link #DEFAULT}.
     *
     * @return the default recognizer, or {@code null} if it cannot be loaded
     */
    public static StopRecognition get() {
        return get(DEFAULT);
    }

    /**
     * Returns the recognizer registered under {@code key}, loading it on first use.
     *
     * <p>If the key is not registered yet but is present in
     * {@code MyStaticValue.ENV}, it is registered with the path found there first.
     *
     * @param key library key
     * @return the recognizer, or {@code null} when the key is unknown or loading failed
     */
    public static StopRecognition get(String key) {
        KV<String, StopRecognition> kv = STOP.get(key);
        if (kv == null) {
            if (MyStaticValue.ENV.containsKey(key)) {
                putIfAbsent(key, MyStaticValue.ENV.get(key));
                return get(key);
            }
            LOG.warn("STOP " + key + " not found in config ");
            return null;
        }
        StopRecognition stopRecognition = kv.getV();
        if (stopRecognition == null) {
            stopRecognition = init(key, kv, false);
        }
        return stopRecognition;
    }

    /**
     * Loads (or reloads) the recognizer for {@code kv} from the path stored in it.
     *
     * <p>Dictionary format: blank lines are ignored; every other line is trimmed
     * and split on TAB. A single column is a stop word. With two or more columns
     * the second one selects the kind: {@code nature}, {@code regex}, or anything
     * else for a plain stop word.
     *
     * @param key library key, used to unregister the entry when loading fails
     * @param kv path/recognizer pair to populate
     * @param reload when {@code true} an already loaded recognizer is cleared and
     *        refilled; when {@code false} it is returned unchanged
     * @return the loaded recognizer, or {@code null} if loading failed (the key is
     *         then removed from the registry)
     */
    private synchronized static StopRecognition init(String key, KV<String, StopRecognition> kv, boolean reload) {
        StopRecognition stopRecognition = kv.getV();
        if (stopRecognition != null) {
            if (reload) {
                stopRecognition.clear();
            } else {
                return stopRecognition;
            }
        } else {
            stopRecognition = new StopRecognition();
        }
        try {
            LOG.debug("begin init FILTER !");
            long start = System.currentTimeMillis();
            String temp = null;
            String[] strs = null;
            try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "UTF-8")) {
                while ((temp = br.readLine()) != null) {
                    if (StringUtil.isNotBlank(temp)) {
                        temp = StringUtil.trim(temp);
                        strs = temp.split("\t");
                        if (strs.length == 1) {
                            stopRecognition.insertStopWords(strs[0]);
                        } else {
                            switch (strs[1]) {
                                case "nature":
                                    stopRecognition.insertStopNatures(strs[0]);
                                    break;
                                case "regex":
                                    stopRecognition.insertStopRegexes(strs[0]);
                                    break;
                                default:
                                    stopRecognition.insertStopWords(strs[0]);
                                    break;
                            }
                        }
                    }
                }
            }
            LOG.info("load stop use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
            kv.setV(stopRecognition);
            return stopRecognition;
        } catch (Exception e) {
            LOG.error("Init Stop library error :" + e.getMessage() + ", path: " + kv.getK());
            STOP.remove(key);
            return null;
        }
    }

    /**
     * Registers (or replaces) a library and records its path in
     * {@code MyStaticValue.ENV}.
     *
     * @param key library key
     * @param path dictionary path
     * @param stopRecognition ready-made recognizer, or {@code null} to load it
     *        from {@code path} on first use
     */
    public static void put(String key, String path, StopRecognition stopRecognition) {
        STOP.put(key, KV.with(path, stopRecognition));
        MyStaticValue.ENV.put(key, path);
    }

    /**
     * Registers a library only when {@code key} is not registered yet. Unlike
     * {@link #put(String, String)} this does not write to {@code MyStaticValue.ENV}.
     *
     * @param key library key
     * @param path dictionary path
     */
    public static void putIfAbsent(String key, String path) {
        if (!STOP.containsKey(key)) {
            STOP.put(key, KV.with(path, (StopRecognition) null));
        }
    }

    /**
     * Registers (or replaces) a library whose recognizer is loaded on first use.
     *
     * @param key library key
     * @param path dictionary path
     */
    public static void put(String key, String path) {
        put(key, path, null);
    }

    /**
     * Returns the already loaded recognizer for {@code key} if there is one;
     * otherwise registers the supplied one.
     *
     * @param key library key
     * @param path dictionary path
     * @param stopRecognition recognizer to register when none is loaded
     * @return the recognizer now associated with {@code key}
     */
    public static synchronized StopRecognition putIfAbsent(String key, String path, StopRecognition stopRecognition) {
        KV<String, StopRecognition> kv = STOP.get(key);
        if (kv != null && kv.getV() != null) {
            return kv.getV();
        }
        put(key, path, stopRecognition);
        return stopRecognition;
    }

    /**
     * Unregisters a library, clearing its recognizer and removing the key from
     * {@code MyStaticValue.ENV}.
     *
     * @param key library key
     * @return the removed entry, or {@code null} if the key was not registered
     */
    public static KV<String, StopRecognition> remove(String key) {
        KV<String, StopRecognition> kv = STOP.get(key);
        if (kv != null && kv.getV() != null) {
            kv.getV().clear();
        }
        MyStaticValue.ENV.remove(key);
        return STOP.remove(key);
    }

    /**
     * @return the key set of the internal registry (a live view, not a copy)
     */
    public static Set<String> keys() {
        return STOP.keySet();
    }

    /**
     * Reloads the library registered under {@code key} from its dictionary path.
     *
     * <p>If the key is no longer present in {@code MyStaticValue.ENV} the library
     * is simply unregistered.
     *
     * @param key library key
     */
    public static void reload(String key) {
        if (!MyStaticValue.ENV.containsKey(key)) {
            // Nothing left to load from. Without this return the key would be
            // re-registered with a null path, and init() would fail on it and
            // log a spurious error before removing it again.
            remove(key);
            return;
        }
        putIfAbsent(key, MyStaticValue.ENV.get(key));
        KV<String, StopRecognition> kv = STOP.get(key);
        init(key, kv, true);
    }
}

View File

@ -1,312 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.library;
import org.ansj.dic.PathToStream;
import org.ansj.domain.KV;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import java.io.BufferedReader;
import java.util.*;
import java.util.Map.Entry;
/**
 * Static registry of named synonym dictionaries.
 *
 * <p>Each key maps to a {@link KV} holding the dictionary path and a
 * {@link SmartForest} in which every word of a synonym group points at the
 * list of all words in that group. The forest is built on the first
 * {@link #get(String)} for the key.
 */
public class SynonymsLibrary {

    private static final Log LOG = MyStaticValue.getLog(SynonymsLibrary.class);

    // Registered synonym dictionaries: key -> (dictionary path, forest or null until loaded).
    private static final Map<String, KV<String, SmartForest<List<String>>>> SYNONYMS = new HashMap<>();

    /** Key of the default synonym library, and the prefix of keys picked up from the environment. */
    public static final String DEFAULT = "synonyms";

    static {
        for (Entry<String, String> entry : MyStaticValue.ENV.entrySet()) {
            if (entry.getKey().startsWith(DEFAULT)) {
                put(entry.getKey(), entry.getValue());
            }
        }
        putIfAbsent(DEFAULT, "library/synonyms.dic");
    }

    /**
     * @return the forest registered under {@link #DEFAULT}, or {@code null} if it cannot be loaded
     */
    public static SmartForest<List<String>> get() {
        return get(DEFAULT);
    }

    /**
     * Returns the forest registered under {@code key}, loading it on first use.
     *
     * <p>If the key is not registered yet but is present in
     * {@code MyStaticValue.ENV}, it is registered with the path found there first.
     *
     * @param key library key
     * @return the forest, or {@code null} when the key is unknown or loading failed
     */
    public static SmartForest<List<String>> get(String key) {
        KV<String, SmartForest<List<String>>> kv = SYNONYMS.get(key);
        if (kv == null) {
            if (MyStaticValue.ENV.containsKey(key)) {
                putIfAbsent(key, MyStaticValue.ENV.get(key));
                return get(key);
            }
            // The message used to say "crf", copied over from another library class.
            LOG.warn("synonyms " + key + " not found in config ");
            return null;
        }
        SmartForest<List<String>> sw = kv.getV();
        if (sw == null) {
            sw = init(key, kv, false);
        }
        return sw;
    }

    /**
     * Loads (or reloads) the forest for {@code kv} from the path stored in it.
     *
     * <p>Dictionary format: one synonym group per line, words separated by TAB.
     * Blank lines and blank fields are ignored; a line with fewer than two
     * non-blank words is skipped with a warning.
     *
     * @param key library key, used to unregister the entry when loading fails
     * @param kv path/forest pair to populate
     * @param reload when {@code true} an already loaded forest is cleared and
     *        refilled; when {@code false} it is returned unchanged
     * @return the loaded forest, or {@code null} if loading failed (the key is
     *         then removed from the registry)
     */
    private static synchronized SmartForest<List<String>> init(String key, KV<String, SmartForest<List<String>>> kv,
                    boolean reload) {
        SmartForest<List<String>> forest = kv.getV();
        if (forest != null) {
            if (reload) {
                forest.clear();
            } else {
                return forest;
            }
        } else {
            forest = new SmartForest<>();
        }
        LOG.debug("begin init synonyms " + kv.getK());
        long start = System.currentTimeMillis();
        try (BufferedReader reader = IOUtil.getReader(PathToStream.stream(kv.getK()), IOUtil.UTF8)) {
            String temp = null;
            while ((temp = reader.readLine()) != null) {
                if (StringUtil.isBlank(temp)) {
                    continue;
                }
                List<String> list = new ArrayList<>();
                for (String word : temp.split("\t")) {
                    if (StringUtil.isBlank(word)) {
                        continue;
                    }
                    list.add(word);
                }
                // Decide on the filtered list, not the raw split: blank columns
                // must neither count as synonyms nor be inserted as keys.
                if (list.size() <= 1) {
                    LOG.warn(temp + " in synonymsLibrary not in to library !");
                    continue;
                }
                for (String word : list) {
                    forest.add(word, list);
                }
            }
            kv.setV(forest);
            LOG.info("load synonyms use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
            return forest;
        } catch (Exception e) {
            LOG.error("Init synonyms library error :" + e.getMessage() + ", path: " + kv.getK());
            SYNONYMS.remove(key);
            return null;
        }
    }

    /**
     * Registers (or replaces) a library whose forest is loaded on first use.
     *
     * @param key library key
     * @param path dictionary path
     */
    public static void put(String key, String path) {
        put(key, path, null);
    }

    /**
     * Registers (or replaces) a library and records its path in
     * {@code MyStaticValue.ENV}.
     *
     * @param key library key
     * @param path dictionary path
     * @param value ready-made forest, or {@code null} to load it from
     *        {@code path} on first use
     */
    public static void put(String key, String path, SmartForest<List<String>> value) {
        SYNONYMS.put(key, KV.with(path, value));
        MyStaticValue.ENV.put(key, path);
    }

    /**
     * Unregisters a library, clearing its forest and removing the key from
     * {@code MyStaticValue.ENV}.
     *
     * @param key library key
     * @return the removed entry, or {@code null} if the key was not registered
     */
    public static KV<String, SmartForest<List<String>>> remove(String key) {
        KV<String, SmartForest<List<String>>> kv = SYNONYMS.get(key);
        if (kv != null && kv.getV() != null) { // clear first, then unregister
            kv.getV().clear();
        }
        MyStaticValue.ENV.remove(key);
        return SYNONYMS.remove(key);
    }

    /**
     * Reloads the library registered under {@code key} from its dictionary path.
     *
     * <p>If the key is no longer present in {@code MyStaticValue.ENV} the library
     * is simply unregistered.
     *
     * @param key library key
     */
    public static void reload(String key) {
        if (!MyStaticValue.ENV.containsKey(key)) {
            // Nothing left to load from. Without this return the key would be
            // re-registered with a null path, and init() would fail on it and
            // log a spurious error before removing it again.
            remove(key);
            return;
        }
        putIfAbsent(key, MyStaticValue.ENV.get(key));
        KV<String, SmartForest<List<String>>> kv = SYNONYMS.get(key);
        init(key, kv, true);
    }

    /**
     * @return the key set of the internal registry (a live view, not a copy)
     */
    public static Set<String> keys() {
        return SYNONYMS.keySet();
    }

    /**
     * Registers a library only when {@code key} is not registered yet. Unlike
     * {@link #put(String, String)} this does not write to {@code MyStaticValue.ENV}.
     *
     * @param key library key
     * @param path dictionary path
     */
    public static void putIfAbsent(String key, String path) {
        if (!SYNONYMS.containsKey(key)) {
            SYNONYMS.put(key, KV.with(path, (SmartForest<List<String>>) null));
        }
    }

    /**
     * Replaces the synonym group(s) of the given words with exactly
     * {@code words}.
     *
     * <p>Every non-blank word is mapped to the new list. Words that used to be
     * synonyms of any of them but are not part of {@code words} are removed from
     * the forest. Nothing happens (apart from a warning) when fewer than two
     * non-blank words are supplied.
     *
     * @param key library key
     * @param words the new synonym group
     * @throws NullPointerException if no forest can be obtained for {@code key}
     */
    public static void insert(String key, String[] words) {
        SmartForest<List<String>> synonyms = get(key);
        List<String> list = new ArrayList<>();
        for (String word : words) {
            if (StringUtil.isBlank(word)) {
                continue;
            }
            list.add(word);
        }
        if (list.size() <= 1) {
            LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word");
            return;
        }
        Set<String> set = findAllWords(key, words);
        for (String word : list) {
            set.remove(word);
            synonyms.add(word, list);
        }
        for (String word : set) { // drop every former synonym that is not in the new group
            synonyms.remove(word);
            synonyms.getBranch(word).setParam(null);
        }
    }

    /**
     * Collects every word currently stored as a synonym of any of {@code words}.
     *
     * @param key library key
     * @param words words to look up
     * @return union of the synonym lists found; empty when none of the words is known
     */
    private static Set<String> findAllWords(String key, String[] words) {
        SmartForest<List<String>> synonyms = get(key);
        Set<String> set = new HashSet<>();
        for (String word : words) {
            SmartForest<List<String>> branch = synonyms.getBranch(word);
            if (branch != null) {
                List<String> params = branch.getParam();
                if (params != null) {
                    set.addAll(params);
                }
            }
        }
        return set;
    }

    /**
     * Merges {@code words} with the existing synonym group(s) of those words.
     *
     * <p>The resulting group is the union of the non-blank {@code words} and
     * everything already recorded as a synonym of any of them. Nothing happens
     * (apart from a warning) when fewer than two distinct non-blank words are
     * supplied.
     *
     * @param key library key
     * @param words words to add to the group
     * @throws NullPointerException if no forest can be obtained for {@code key}
     */
    public static void append(String key, String[] words) {
        SmartForest<List<String>> synonyms = get(key);
        Set<String> set = new HashSet<>();
        for (String word : words) {
            if (StringUtil.isBlank(word)) {
                continue;
            }
            set.add(word);
        }
        if (set.size() <= 1) {
            LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word");
            return;
        }
        set.addAll(findAllWords(key, words));
        List<String> list = new ArrayList<>(set);
        for (String word : list) {
            synonyms.addBranch(word, list);
        }
    }

    /**
     * Removes a single word from its synonym group.
     *
     * <p>If the group is left with only one member, that member is removed from
     * the forest as well. Words whose branch is missing or has a status below 2
     * are ignored.
     *
     * @param key library key
     * @param word word to remove
     * @throws NullPointerException if no forest can be obtained for {@code key}
     */
    public static void remove(String key, String word) {
        SmartForest<List<String>> synonyms = get(key);
        SmartForest<List<String>> branch = synonyms.getBranch(word);
        if (branch == null || branch.getStatus() < 2) {
            return;
        }
        List<String> params = branch.getParam();
        synonyms.remove(word);
        branch.setParam(null);
        params.remove(word);
        if (params.size() == 1) { // a group of one is no group: drop the last member too
            synonyms.remove(params.get(0));
            params.remove(0);
        } else {
            // Second pass kept from the original: removes one more occurrence
            // when the word was listed twice in its group.
            params.remove(word);
        }
    }
}

View File

@ -1,73 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.library.company;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
/**
 * Lazily loaded table of per-word company attributes.
 *
 * <p>Each entry maps a word to a two-element {@code int[]} parsed from the
 * second and third TAB-separated columns of the company dictionary.
 * NOTE(review): the original comment only says "company_freq"; the exact
 * meaning of the two numbers is not visible here.
 *
 * @author ansj
 */
public class CompanyAttrLibrary {

    private static final Log logger = LogFactory.getLog();

    private static HashMap<String, int[]> cnMap = null;

    private CompanyAttrLibrary() {}

    /**
     * Returns the company attribute table, loading it on first use.
     *
     * @return the table; empty (never {@code null}) when the dictionary could
     *         not be read
     */
    public static HashMap<String, int[]> getCompanyMap() {
        if (cnMap == null) {
            init();
        }
        return cnMap;
    }

    /**
     * Reads the company dictionary into a fresh map and publishes it in one
     * step, so callers never observe a half-filled table. Lines that do not have
     * three columns, or whose numbers cannot be parsed, are skipped with a
     * warning instead of aborting the load.
     */
    private static void init() {
        HashMap<String, int[]> loaded = new HashMap<>();
        try (BufferedReader br = MyStaticValue.getCompanReader()) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] cols = line.split("\t");
                if (cols.length < 3) {
                    logger.warn("skip malformed company attribute line: " + line);
                    continue;
                }
                try {
                    int[] attr = new int[2];
                    attr[0] = Integer.parseInt(cols[1]);
                    attr[1] = Integer.parseInt(cols[2]);
                    loaded.put(cols[0], attr);
                } catch (NumberFormatException e) {
                    logger.warn("skip company attribute line with invalid number: " + line, e);
                }
            }
        } catch (IOException e) {
            logger.warn("IO异常", e);
        }
        cnMap = loaded;
    }
}

View File

@ -1,99 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.library.name;
import org.ansj.domain.PersonNatureAttr;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
/**
 * Loads person-name attributes into a hash map keyed by word.
 *
 * <p>The map is built from two sources: the person dictionary reader
 * ({@code MyStaticValue.getPersonReader()}) and the name-frequency table
 * ({@code MyStaticValue.getPersonFreqMap()}).
 *
 * @author ansj
 */
public class PersonAttrLibrary {

    private static final Log logger = LogFactory.getLog();

    private HashMap<String, PersonNatureAttr> pnMap = null;

    public PersonAttrLibrary() {}

    /**
     * Returns the person attribute map, building it on first use.
     *
     * <p>The map is assembled in a local variable and only stored once both
     * sources have been merged, so a failure while loading never leaves a
     * half-built map cached in this instance.
     *
     * @return word -> person attributes; never {@code null}
     */
    public HashMap<String, PersonNatureAttr> getPersonMap() {
        if (pnMap != null) {
            return pnMap;
        }
        HashMap<String, PersonNatureAttr> loaded = new HashMap<>();
        loadPersonDic(loaded);
        mergeNameFreq(loaded);
        pnMap = loaded;
        return pnMap;
    }

    /**
     * Merges the name-frequency table (name_freq) into {@code target}, creating
     * an attribute object for words that are not present yet.
     */
    private void mergeNameFreq(Map<String, PersonNatureAttr> target) {
        for (Entry<String, int[][]> entry : MyStaticValue.getPersonFreqMap().entrySet()) {
            PersonNatureAttr pna = target.get(entry.getKey());
            if (pna == null) {
                pna = new PersonNatureAttr();
                target.put(entry.getKey(), pna);
            }
            pna.setlocFreq(entry.getValue());
        }
    }

    /**
     * Reads the person dictionary (person.dic) into {@code target}.
     *
     * <p>Each line is split on TAB; the first column is the word and the second
     * and third columns are passed as integers to
     * {@code PersonNatureAttr.addFreq}. As before, a number-format or IO error
     * stops reading with a warning and keeps what has been loaded so far.
     */
    private void loadPersonDic(Map<String, PersonNatureAttr> target) {
        try (BufferedReader br = MyStaticValue.getPersonReader()) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] strs = line.split("\t");
                PersonNatureAttr pna = target.get(strs[0]);
                if (pna == null) {
                    pna = new PersonNatureAttr();
                }
                pna.addFreq(Integer.parseInt(strs[1]), Integer.parseInt(strs[2]));
                target.put(strs[0], pna);
            }
        } catch (NumberFormatException e) {
            logger.warn("数字格式不正确", e);
        } catch (IOException e) {
            logger.warn("IO异常", e);
        }
    }
}

View File

@ -1,35 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition;
import org.ansj.domain.Result;
import java.io.Serializable;
/**
 * Contract for a recognizer that is applied to a segmentation {@link Result}.
 * Implementations are {@link Serializable}.
 *
 * @author Ansj
 */
public interface Recognition extends Serializable {

    /**
     * Processes the given result.
     *
     * @param result the result to work on
     */
    void recognition(Result result);
}

View File

@ -1,33 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition;
import org.ansj.domain.Term;
/**
 * Contract for a recognizer that works directly on an array of {@link Term}s.
 *
 * @author Ansj
 */
public interface TermArrRecognition {

    /**
     * Processes the given term array.
     *
     * @param terms the terms to work on
     */
    void recognition(Term[] terms);
}

View File

@ -1,197 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.arrimpl;
import org.ansj.domain.*;
import org.ansj.library.NgramLibrary;
import org.ansj.recognition.TermArrRecognition;
import org.ansj.util.TermUtil;
import org.ansj.util.TermUtil.InsertTermType;
import java.util.ArrayList;
import java.util.List;
/**
 * Recognizes person names in a term array from the per-term person-name
 * attributes ({@code termNatures().personAttr}) and emits them as new terms
 * with nature {@code TermNatures.NR}.
 *
 * <p>The instance keeps the array being processed and the outcome of the last
 * {@code nameFind} call in fields ({@code terms}, {@code skip}).
 *
 * @author ansj
 */
public class AsianPersonRecognition implements TermArrRecognition {
    // Weights indexed by the `size` argument of nameFind; used there as -log(FACTORY[size]).
    private static final double[] FACTORY = {0.16271366224044456, 0.8060521860870434, 0.031234151672511947};
    // Set by nameFind: true when the name it just returned contained no term with personAttr.allFreq > 0.
    private boolean skip = false;
    // The term array currently being processed.
    private Term[] terms;
    // Whether the name is ambiguous
    // public int B = -1;//0 surname
    // public int C = -1;//1 first character of a two-character given name
    // public int D = -1;//2 last character of a two-character given name
    // public int E = -1;//3 single-character given name
    // public int N = -1; //4 any character
    // public int L = -1;//11 context following a person name
    // public int M = -1;//12 component between two Chinese person names
    // public int m = -1;//44 a name that can be split
    // double[] factory = {"BC", "BCD", "BCDE"}
    /**
     * Finds person-name candidates in {@code terms} and inserts each of them
     * into the array via {@code TermUtil.insertTerm} with
     * {@code InsertTermType.SCORE_ADD_SORT}.
     *
     * @param terms the term array to scan and update
     */
    @Override
    public void recognition(Term[] terms) {
        this.terms = terms;
        List<Term> termList = recogntion_();
        for (Term term2 : termList) {
            TermUtil.insertTerm(terms, term2, InsertTermType.SCORE_ADD_SORT);
        }
    }
    /**
     * Scans {@code this.terms} for name candidates.
     *
     * <p>Every non-null term whose {@code personAttr.flag} is set has its score
     * and self score reset to 0 and is tried as the start of a name for
     * {@code j = 2, 1, 0}, whenever {@code personAttr.getFreq(j, 0)} exceeds 10.
     *
     * @return the candidate terms found, in scan order
     */
    private List<Term> recogntion_() {
        Term term = null;
        Term tempTerm = null;
        List<Term> termList = new ArrayList<>();
        // Begin frequency handed to nameFind; 10 until a person-flagged term has been processed.
        int beginFreq = 10;
        for (int i = 0; i < terms.length; i++) {
            term = terms[i];
            if (term == null || !term.termNatures().personAttr.flag) {
                continue;
            }
            term.score(0);
            term.selfScore(0);
            int freq = 0;
            for (int j = 2; j > -1; j--) {
                freq = term.termNatures().personAttr.getFreq(j, 0);
                // NOTE(review): the second operand is subsumed by the first (both require freq > 10).
                if ((freq > 10) || (term.getName().length() == 2 && freq > 10)) {
                    tempTerm = nameFind(i, beginFreq, j);
                    if (tempTerm != null) {
                        termList.add(tempTerm);
                        // Unambiguous recognition: zero the scores of the covered
                        // terms and resume scanning just before the name's toValue().
                        if (skip) {
                            for (int j2 = i; j2 < tempTerm.toValue(); j2++) {
                                if (terms[j2] != null) {
                                    terms[j2].score(0);
                                    terms[j2].selfScore(0);
                                }
                            }
                            i = tempTerm.toValue() - 1;
                            break;
                        }
                    }
                }
            }
            // Derived from the term that started this iteration, also after the jump above.
            beginFreq = term.termNatures().personAttr.begin + 1;
        }
        return termList;
    }
    /**
     * Tries to build a name of pattern {@code size} starting at index
     * {@code offe}.
     *
     * <p>Up to {@code size + 2} consecutive non-null terms are concatenated. The
     * candidate is rejected ({@code null}) when any of them has frequency 0 for
     * its position, when the bigram frequency between the last name term and the
     * next non-null term exceeds 3, when the final score is greater than -3, or
     * when {@code allFreq > 0} and at least one term had
     * {@code personAttr.allFreq > 0}. As a side effect {@code skip} is reset to
     * {@code false} on entry and set to {@code undefinite == 0} on success.
     *
     * @param offe index in {@code terms} where the name starts
     * @param beginFreq begin frequency; its logarithm is subtracted from the score
     * @param size name pattern index, also used to index {@code FACTORY}
     * @return the new name term with its self score set, or {@code null}
     */
    private Term nameFind(int offe, int beginFreq, int size) {
        StringBuilder sb = new StringBuilder();
        int undefinite = 0;
        skip = false;
        PersonNatureAttr pna = null;
        int index = 0;
        int freq = 0;
        double allFreq = 0;
        Term term = null;
        int i = offe;
        for (; i < terms.length; i++) {
            // Empty slots are skipped. NOTE(review): if the array ends before
            // size + 2 terms were collected, the shorter string is still scored below.
            if (terms[i] == null) {
                continue;
            }
            term = terms[i];
            pna = term.termNatures().personAttr;
            // Frequency of this term at this position for this pattern; zero means
            // the pattern is impossible, so give up.
            if ((freq = pna.getFreq(size, index)) == 0) {
                return null;
            }
            if (pna.allFreq > 0) {
                undefinite++;
            }
            sb.append(term.getName());
            allFreq += Math.log(term.termNatures().allFreq + 1);
            allFreq += -Math.log((freq));
            index++;
            if (index == size + 2) {
                break;
            }
        }
        double score = -Math.log(FACTORY[size]);
        score += allFreq;
        double endFreq = 0;
        // Look for the first non-null term after the name to obtain the end frequency.
        boolean flag = true;
        while (flag) {
            i++;
            if (i >= terms.length) {
                endFreq = 10;
                flag = false;
            } else if (terms[i] != null) {
                int twoWordFreq = NgramLibrary.getTwoWordFreq(term, terms[i]);
                if (twoWordFreq > 3) {
                    return null;
                }
                endFreq = terms[i].termNatures().personAttr.end + 1;
                flag = false;
            }
        }
        score -= Math.log(endFreq);
        score -= Math.log(beginFreq);
        if (score > -3) {
            return null;
        }
        if (allFreq > 0 && undefinite > 0) {
            return null;
        }
        skip = undefinite == 0;
        term = new Term(sb.toString(), offe, TermNatures.NR);
        term.selfScore(score);
        return term;
    }
    /**
     * Scans {@code terms} and returns the names found as {@link NewWord}s with
     * nature {@code Nature.NR}; the array is not given new terms by this method.
     *
     * @param terms the term array to scan
     * @return one {@link NewWord} per candidate name
     */
    public List<NewWord> getNewWords(Term[] terms) {
        this.terms = terms;
        List<NewWord> all = new ArrayList<>();
        List<Term> termList = recogntion_();
        for (Term term2 : termList) {
            all.add(new NewWord(term2.getName(), Nature.NR));
        }
        return all;
    }
    /**
     * Re-runs the scan on the term array stored by the last call to
     * {@link #recognition(Term[])} or {@link #getNewWords(Term[])}.
     *
     * @return the candidate name terms
     */
    public List<Term> getNewTerms() {
        return recogntion_();
    }
}

View File

@ -1,248 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.arrimpl;
import org.ansj.domain.Nature;
import org.ansj.domain.NewWord;
import org.ansj.domain.Term;
import org.ansj.domain.TermNatures;
import org.ansj.recognition.TermArrRecognition;
import org.ansj.util.TermUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import java.util.*;
/**
*
*
* @author ansj
*/
public class ForeignPersonRecognition implements TermArrRecognition {
private static final LinkedList<NameChar> PRLIST = new LinkedList<>();
private static NameChar INNAME = null;
private static HashSet<Character> ISNOTFIRST = new HashSet<>();
static {
NameChar trans_english = new NameChar(StringUtil.sortCharArray(
"·-—阿埃艾爱安昂敖奥澳笆芭巴白拜班邦保堡鲍北贝本比毕彼别波玻博勃伯泊卜布才采仓查差柴彻川茨慈次达大戴代丹旦但当道德得登迪狄蒂帝丁东杜敦多额俄厄鄂恩尔伐法范菲芬费佛夫福弗甫噶盖干冈哥戈革葛格各根古瓜哈海罕翰汗汉豪合河赫亨侯呼胡华霍基吉及加贾坚简杰金京久居君喀卡凯坎康考柯科可克肯库奎拉喇莱来兰郎朗劳勒雷累楞黎理李里莉丽历利立力连廉良列烈林隆卢虏鲁路伦仑罗洛玛马买麦迈曼茅茂梅门蒙盟米蜜密敏明摩莫墨默姆木穆那娜纳乃奈南内尼年涅宁纽努诺欧帕潘畔庞培佩彭皮平泼普其契恰强乔切钦沁泉让热荣肉儒瑞若萨塞赛桑瑟森莎沙山善绍舍圣施诗石什史士守斯司丝苏素索塔泰坦汤唐陶特提汀图土吐托陀瓦万王旺威韦维魏温文翁沃乌吾武伍西锡希喜夏相香歇谢辛新牙雅亚彦尧叶依伊衣宜义因音英雍尤于约宰泽增詹珍治中仲朱诸卓孜祖佐伽娅尕腓滕济嘉津赖莲琳律略慕妮聂裴浦奇齐琴茹珊卫欣逊札哲智兹芙汶迦珀琪梵斐胥黛"));
NameChar trans_russian = new NameChar(StringUtil.sortCharArray(
"·-阿安奥巴比彼波布察茨大德得丁杜尔法夫伏甫盖格哈基加坚捷金卡科可克库拉莱兰勒雷里历利连列卢鲁罗洛马梅蒙米姆娜涅宁诺帕泼普奇齐乔切日萨色山申什斯索塔坦特托娃维文乌西希谢亚耶叶依伊以扎佐柴达登蒂戈果海赫华霍吉季津柯理琳玛曼穆纳尼契钦丘桑沙舍泰图瓦万雅卓兹"));
// Japanese names are commented out: ostensibly a boycott of Japanese goods, in reality they are simply not handled well yet.
// NameChar trans_japanese = new NameChar(
// StringUtil
// .sortCharArray("安奥八白百邦保北倍本比滨博步部彩菜仓昌长朝池赤川船淳次村大代岛稻道德地典渡尔繁饭风福冈高工宫古谷关广桂贵好浩和合河黑横恒宏后户荒绘吉纪佳加见健江介金今进井静敬靖久酒菊俊康可克口梨理里礼栗丽利立凉良林玲铃柳隆鹿麻玛美萌弥敏木纳南男内鸟宁朋片平崎齐千前浅桥琴青清庆秋丘曲泉仁忍日荣若三森纱杉山善上伸神圣石实矢世市室水顺司松泰桃藤天田土万望尾未文武五舞西细夏宪相小孝新星行雄秀雅亚岩杨洋阳遥野也叶一伊衣逸义益樱永由有佑宇羽郁渊元垣原远月悦早造则泽增扎宅章昭沼真政枝知之植智治中忠仲竹助椎子佐阪坂堀荻菅薰浜濑鸠筱"));
PRLIST.add(trans_english);
PRLIST.add(trans_russian);
// PRLIST.add(trans_japanese);
INNAME = new NameChar(StringUtil.sortCharArray(
"-·—丁万丘东丝中丹丽乃久义乌乔买于亚亨京什仑仓代以仲伊伍伏伐伦伯伽但佐佛佩依侯俄保儒克兰其兹内冈凯切列利别力加努劳勃勒北华卓南博卜卡卢卫厄历及古可史叶司各合吉吐君吾呼哈哥哲唐喀善喇喜嘉噶因图土圣坎坚坦埃培基堡塔塞增墨士夏多大夫奇奈奎契奥妮姆威娃娅娜孜季宁守安宜宰密察尔尕尤尧尼居山川差巴布希帕帝干平年库庞康廉弗强当彦彭彻彼律得德恩恰慈慕戈戴才扎托拉拜捷提摩敏敖敦文斐斯新施日旦旺昂明普智曼朗木本札朱李杜来杰林果查柯柴根格桑梅梵森楞次欣欧歇武比毕汀汉汗汤汶沁沃沙河治泉泊法波泰泼泽洛津济浦海涅温滕潘澳烈热爱牙特狄王玛玻珀珊珍班理琪琳琴瑞瑟瓜瓦甫申畔略登白皮盖盟相石祖福科穆立笆简米素索累约纳纽绍维罕罗翁翰考耶聂肉肯胡胥腓舍良色艾芙芬芭苏若英茂范茅茨茹荣莉莎莫莱莲菲萨葛蒂蒙虏蜜衣裴西詹让诗诸诺谢豪贝费贾赖赛赫路辛达迈连迦迪逊道那邦郎鄂采里金钦锡门阿陀陶隆雅雍雷霍革韦音额香马魏鲁鲍麦黎默黛齐"));
ISNOTFIRST.add('-');
ISNOTFIRST.add('·');
ISNOTFIRST.add('—');
}
private List<Term> tempList = new ArrayList<>();
private LinkedList<NameChar> prList = null;
private Term[] terms = null;
/**
 * Scans {@code terms} for runs of tokens that look like a transliterated
 * foreign person name and merges each run of two or more tokens into a single
 * {@code TermNatures.NR} term via {@code TermUtil.insertTerm}.
 *
 * <p>A token can extend the current run when its nature is {@code NR} or
 * {@code NW}, or it is a single character, and {@link #validate(String)}
 * accepts it. A run cannot start on a token whose {@code personAttr.end}
 * exceeds 10 or on one of the separator characters in {@code ISNOTFIRST}.
 *
 * <p>Any token that does not extend the run ends it. This includes a token
 * rejected by {@code validate}: previously such a token emptied the candidate
 * charset list and, unless a run was already in progress, nothing ever
 * restored it, so no further name could be recognized in the rest of the array.
 *
 * @param terms the term array to scan and update
 */
@Override
public void recognition(Term[] terms) {
    this.terms = terms;
    reset();
    for (int i = 0; i < terms.length; i++) {
        Term term = terms[i];
        if (term == null) {
            continue;
        }
        String name = term.getName();
        // A run must not start on a name prefix/suffix word or on a separator.
        if (tempList.isEmpty()) {
            if (term.termNatures().personAttr.end > 10) {
                continue;
            }
            if (name.length() == 1 && ISNOTFIRST.contains(name.charAt(0))) {
                continue;
            }
        }
        boolean candidate = term.termNatures() == TermNatures.NR || term.termNatures() == TermNatures.NW
                        || name.length() == 1;
        if (candidate && validate(name)) {
            tempList.add(term);
            continue;
        }
        // The run ends here: emit it when it has at least two tokens, then
        // start over with the full charset list.
        if (tempList.size() > 1) {
            TermUtil.insertTerm(terms, tempList, TermNatures.NR);
        }
        reset();
    }
}
/**
 * Checks {@code name} against the charsets still in {@code prList} and drops
 * every charset that does not contain its first character.
 *
 * @param name token text; only its first character is tested
 * @return {@code true} if at least one remaining charset accepted the token
 */
private boolean validate(String name) {
    boolean matched = false;
    Iterator<NameChar> candidates = prList.iterator();
    while (candidates.hasNext()) {
        if (candidates.next().contains(name)) {
            matched = true;
        } else {
            // This charset can no longer describe the current run.
            candidates.remove();
        }
    }
    return matched;
}
/**
 * Discards the run collected so far and restores the full list of candidate
 * charsets from {@code PRLIST}.
 */
private void reset() {
    tempList.clear();
    prList = new LinkedList<>(PRLIST);
}
/**
 * Tells whether every character of {@code name} belongs to the {@code INNAME}
 * charset.
 *
 * @param name text to check
 * @return {@code true} if all characters are in the charset (also for an empty
 *         string), {@code false} otherwise
 */
public static boolean isFName(String name) {
    for (char c : name.toCharArray()) {
        if (!INNAME.contains(c)) {
            return false;
        }
    }
    return true;
}
/**
 * A set of characters backed by a char array that is looked up with binary
 * search, so the array handed to the constructor must already be sorted.
 */
private static class NameChar {
    private final char[] chars;

    public NameChar(char[] chars) {
        this.chars = chars;
    }

    /** Tests the first character of {@code name}. */
    public boolean contains(String name) {
        return contains(name.charAt(0));
    }

    /** Tests a single character. */
    public boolean contains(char c) {
        return Arrays.binarySearch(chars, c) >= 0;
    }
}
/**
 * Scans {@code terms} for runs of tokens that look like a transliterated
 * foreign person name and returns each run of two or more tokens as a
 * {@link NewWord} with nature {@code Nature.NRF}. The array gets no new terms.
 *
 * <p>A token can extend the current run when its nature is {@code NR} or
 * {@code NW}, or it is a single character, and {@link #validate(String)}
 * accepts it. A run cannot start on a token whose {@code personAttr.end}
 * exceeds 10 or on one of the separator characters in {@code ISNOTFIRST}.
 *
 * <p>Any token that does not extend the run ends it. This includes a token
 * rejected by {@code validate}: previously such a token emptied the candidate
 * charset list and, unless a run was already in progress, nothing ever
 * restored it, so no further name could be found in the rest of the array.
 *
 * @param terms the term array to scan
 * @return the names found, in scan order
 */
public List<NewWord> getNewWords(Term[] terms) {
    this.terms = terms;
    List<NewWord> all = new ArrayList<>();
    reset();
    for (int i = 0; i < terms.length; i++) {
        Term term = terms[i];
        if (term == null) {
            continue;
        }
        String name = term.getName();
        // A run must not start on a name prefix/suffix word or on a separator.
        if (tempList.isEmpty()) {
            if (term.termNatures().personAttr.end > 10) {
                continue;
            }
            if (name.length() == 1 && ISNOTFIRST.contains(name.charAt(0))) {
                continue;
            }
        }
        boolean candidate = term.termNatures() == TermNatures.NR || term.termNatures() == TermNatures.NW
                        || name.length() == 1;
        if (candidate && validate(name)) {
            tempList.add(term);
            continue;
        }
        // The run ends here: emit it when it has at least two tokens, then
        // start over with the full charset list.
        if (tempList.size() > 1) {
            StringBuilder sb = new StringBuilder();
            for (Term temp : tempList) {
                sb.append(temp.getName());
            }
            all.add(new NewWord(sb.toString(), Nature.NRF));
        }
        reset();
    }
    return all;
}
/**
 * Scans the term array stored by the last call to
 * {@code recognition(Term[])} or {@code getNewWords(Term[])} and returns each
 * run of two or more name-like tokens as a new term built by
 * {@link #makeNewTerm()}. The array itself is not modified.
 *
 * <p>A token can extend the current run when its nature is {@code NR} or
 * {@code NW}, or it is a single character, and {@link #validate(String)}
 * accepts it. A run cannot start on a token whose {@code personAttr.end}
 * exceeds 10 or on one of the separator characters in {@code ISNOTFIRST}.
 *
 * <p>Any token that does not extend the run ends it. This includes a token
 * rejected by {@code validate}: previously such a token emptied the candidate
 * charset list and, unless a run was already in progress, nothing ever
 * restored it, so no further name could be found in the rest of the array.
 *
 * @return the name terms found, in scan order
 * @throws NullPointerException if no term array has been supplied yet
 */
public List<Term> getNewTerms() {
    LinkedList<Term> result = new LinkedList<>();
    reset();
    for (int i = 0; i < terms.length; i++) {
        Term term = terms[i];
        if (term == null) {
            continue;
        }
        String name = term.getName();
        // A run must not start on a name prefix/suffix word or on a separator.
        if (tempList.isEmpty()) {
            if (term.termNatures().personAttr.end > 10) {
                continue;
            }
            if (name.length() == 1 && ISNOTFIRST.contains(name.charAt(0))) {
                continue;
            }
        }
        boolean candidate = term.termNatures() == TermNatures.NR || term.termNatures() == TermNatures.NW
                        || name.length() == 1;
        if (candidate && validate(name)) {
            tempList.add(term);
            continue;
        }
        // The run ends here: emit it when it has at least two tokens, then
        // start over with the full charset list.
        if (tempList.size() > 1) {
            result.add(makeNewTerm());
        }
        reset();
    }
    return result;
}
/**
 * Builds one {@code TermNatures.NR} term from the tokens collected in
 * {@code tempList}: their names are concatenated and the offset of the first
 * token is used.
 *
 * @return the combined term
 * @throws IndexOutOfBoundsException if no token has been collected
 */
public Term makeNewTerm() {
    StringBuilder joined = new StringBuilder();
    int start = tempList.get(0).getOffe();
    for (Term part : tempList) {
        joined.append(part.getName());
    }
    String fullName = joined.toString();
    return new Term(fullName, start, TermNatures.NR);
}
}

View File

@ -1,158 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.arrimpl;
import org.ansj.dic.LearnTool;
import org.ansj.domain.Nature;
import org.ansj.domain.NewWord;
import org.ansj.domain.Term;
import org.ansj.util.TermUtil;
import org.ansj.util.TermUtil.InsertTermType;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
/**
 * Inserts words known to a {@link LearnTool} into a term array by walking the
 * tool's {@code SmartForest} along consecutive terms.
 *
 * <p>The walk keeps its position and the word being assembled in instance
 * fields ({@code branch}, {@code sb}, {@code from}, {@code to}, {@code offe},
 * {@code score}, {@code tempNature}), which {@code reset()} clears between
 * attempts.
 *
 * @author ansj
 */
public class NewWordRecognition {
    // The term array currently being processed.
    private Term[] terms = null;
    // Score of the forest entry matched last.
    private double score;
    // Text of the word assembled so far.
    private StringBuilder sb = new StringBuilder();
    // Root of the learned-word forest.
    private SmartForest<NewWord> forest = null;
    // Current position in the forest while walking.
    private SmartForest<NewWord> branch = null;
    // Nature of the forest entry matched last.
    private Nature tempNature;
    // Term preceding the word being assembled.
    private Term from;
    // Term following the word being assembled.
    private Term to;
    // Offset: index in terms where the word being assembled starts.
    private int offe;
    /**
     * @param learn source of the learned-word forest; both the root and the walk
     *        cursor are initialised from {@code learn.getForest()}
     */
    public NewWordRecognition(LearnTool learn) {
        forest = learn.getForest();
        branch = learn.getForest();
    }
    /**
     * Walks the forest from every non-null term except the one in the last
     * array slot and inserts a new term for each learned word matched along the
     * way. The score and self score of every visited start term are reset to 0.
     * Does nothing when there is no forest.
     *
     * <p>NOTE(review): the branch status values appear to follow the usual
     * SmartForest convention (1 = prefix only, 2 = word that can be extended,
     * 3 = word end); confirm against SmartForest.
     *
     * @param terms the term array to scan and update
     */
    public void recognition(Term[] terms) {
        this.terms = terms;
        if (branch == null) {
            return;
        }
        // The last slot is never used as a start position.
        int length = terms.length - 1;
        Term term = null;
        for (int i = 0; i < length; i++) {
            if (terms[i] == null) {
                continue;
            } else {
                from = terms[i].from();
                terms[i].score(0);
                terms[i].selfScore(0);
            }
            branch = branch.getBranch(terms[i].getName());
            // No entry starts with this term, or the term alone is already a
            // terminal entry: nothing to assemble from here.
            if (branch == null || branch.getStatus() == 3) {
                reset();
                continue;
            }
            offe = i;
            // Keep following the linked terms and extend the word.
            term = terms[i];
            sb.append(term.getName());
            if (branch.getStatus() == 2) {
                term.selfScore(branch.getParam().getScore());
            }
            boolean flag = true;
            while (flag) {
                term = term.to();
                branch = branch.getBranch(term.getName());
                // Stop as soon as the forest has no continuation.
                if (branch == null) {
                    break;
                }
                switch (branch.getStatus()) {
                    case 1:
                        sb.append(term.getName());
                        continue;
                    case 2:
                        // A word ends here but may be extended: emit it and go on.
                        sb.append(term.getName());
                        score = branch.getParam().getScore();
                        tempNature = branch.getParam().getNature();
                        to = term.to();
                        makeNewTerm();
                        continue;
                    case 3:
                        // A word ends here and cannot be extended: emit it and stop.
                        sb.append(term.getName());
                        score = branch.getParam().getScore();
                        tempNature = branch.getParam().getNature();
                        to = term.to();
                        makeNewTerm();
                        flag = false;
                        break;
                    default:
                        // Unexpected status; the walk continues with the next term.
                        System.out.println("怎么能出现0呢?");
                        break;
                }
            }
            reset();
        }
    }
    /**
     * Creates a term for the text assembled in {@code sb}, links it between
     * {@code from} and {@code to}, inserts it into {@code terms} with
     * {@code InsertTermType.SCORE_ADD_SORT} and parses its nature. Words longer
     * than three characters also receive the covered terms as sub terms.
     */
    private void makeNewTerm() {
        Term term = new Term(sb.toString(), offe, tempNature.natureStr, 1);
        term.selfScore(score);
        term.setNature(tempNature);
        if (sb.length() > 3) {
            term.setSubTerm(TermUtil.getSubTerm(from, to));
        }
        TermUtil.termLink(from, term);
        TermUtil.termLink(term, to);
        TermUtil.insertTerm(terms, term, InsertTermType.SCORE_ADD_SORT);
        TermUtil.parseNature(term);
    }
    /**
     * Clears the per-word state and moves the walk cursor back to the forest root.
     */
    private void reset() {
        offe = -1;
        tempNature = null;
        branch = forest;
        score = 0;
        sb = new StringBuilder();
    }
}

View File

@ -1,84 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.arrimpl;
import org.ansj.domain.Term;
import org.ansj.recognition.TermArrRecognition;
import org.ansj.util.MyStaticValue;
import org.ansj.util.TermUtil;
/**
 * Merges adjacent numeric terms of a term array into single terms, including numbers
 * written with a decimal point and, optionally, a trailing quantifier.
 */
public class NumRecognition implements TermArrRecognition {
/**
 * Merges numeric terms in place. Absorbed slots of the array are set to null and the
 * surviving term is re-linked to the term that follows the merged run.
 *
 * @param terms term array to process; the last slot is never used as a start
 */
@Override
public void recognition(Term[] terms) {
int length = terms.length - 1;
Term from = null;
Term to = null;
Term temp = null;
for (int i = 0; i < length; i++) {
if (terms[i] == null) {
continue;
// NOTE(review): the "" comparison looks like a full-width dot lost in an encoding
// conversion - confirm against the upstream source.
} else if (".".equals(terms[i].getName()) || "".equals(terms[i].getName())) {
// A dot with a numeric term on both sides is folded into the term before it.
to = terms[i].to();
from = terms[i].from();
if (from.termNatures().numAttr.flag && to.termNatures().numAttr.flag) {
from.setName(from.getName() + "." + to.getName());
TermUtil.termLink(from, to.to());
terms[to.getOffe()] = null;
terms[i] = null;
// Step back so the next iteration re-examines the merged term.
i = from.getOffe() - 1;
}
continue;
} else if (!terms[i].termNatures().numAttr.flag) {
continue;
}
temp = terms[i];
// Append every directly following numeric term to terms[i].
while ((temp = temp.to()).termNatures().numAttr.flag) {
terms[i].setName(terms[i].getName() + temp.getName());
}
// With quantifier recognition enabled, also absorb a following term whose
// numEndFreq is positive.
if (MyStaticValue.isQuantifierRecognition && temp.termNatures().numAttr.numEndFreq > 0) {
terms[i].setName(terms[i].getName() + temp.getName());
temp = temp.to();
}
// A different successor means terms[i] absorbed at least one term.
if (terms[i].to() != temp) {
TermUtil.termLink(terms[i], temp);
// Clear the slots of the absorbed terms.
for (int j = i + 1; j < temp.getOffe(); j++) {
terms[j] = null;
}
// Continue scanning at the first term after the merged run.
i = temp.getOffe() - 1;
}
}
}
}

View File

@ -1,185 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.arrimpl;
import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.DicLibrary;
import org.ansj.recognition.TermArrRecognition;
import org.ansj.util.TermUtil;
import org.ansj.util.TermUtil.InsertTermType;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
* .
*
* @author ansj
*
*/
/**
 * Recognises user-dictionary words in a term array: adjacent terms whose concatenated
 * names form an entry of one of the configured forests are combined into a new term.
 *
 * Scan state is kept in instance fields and rewound by reset().
 *
 * @author ansj
 *
 */
public class UserDefineRecognition implements TermArrRecognition {
public static final Log logger = LogFactory.getLog(UserDefineRecognition.class);
// Term array currently being scanned.
private Term[] terms = null;
// Dictionaries to consult; defaults to the library's default forest.
private Forest[] forests = {DicLibrary.get()};
// Index of the first term of the current candidate, -1 when there is none.
private int offe = -1;
// Index of the last term of the current candidate, -1 when there is none.
private int endOffe = -1;
// Frequency read from the matched entry; 50 is the fallback value.
private int tempFreq = 50;
// Nature string read from the matched entry.
private String tempNature;
// Current position inside the forest being walked.
private SmartForest<String[]> branch = null;
// Root of the forest being walked; reset() returns branch to it.
private SmartForest<String[]> forest = null;
// How newly built terms are inserted into the term array.
private InsertTermType type = InsertTermType.SKIP;
/**
 * @param type    insertion mode passed to TermUtil.insertTerm for every new term
 * @param forests dictionaries to use; when null or empty the default forest is kept
 */
public UserDefineRecognition(InsertTermType type, Forest... forests) {
this.type = type;
if (forests != null && forests.length > 0) {
this.forests = forests;
}
}
/**
 * Runs the recognition once per non-null forest over the whole term array.
 *
 * @param terms term array to scan; new terms are inserted into it in place
 */
@Override
public void recognition(Term[] terms) {
this.terms = terms;
for (Forest forest : forests) {
if (forest == null) {
continue;
}
reset();
this.forest = forest;
branch = forest;
int length = terms.length - 1;
boolean flag = true;
for (int i = 0; i < length; i++) {
if (terms[i] == null)
continue;
// flag is true only when this term continues a match that already started.
if (branch == forest) {
flag = false;
} else {
flag = true;
}
branch = termStatus(branch, terms[i]);
if (branch == null) {
// Match failed: resume scanning right after the term where it began.
if (offe != -1) {
i = offe;
}
reset();
} else if (branch.getStatus() == 3) {
// Status 3: emit the word if it spans more than one term, then start over.
endOffe = i;
tempNature = branch.getParam()[0];
tempFreq = getInt(branch.getParam()[1], 50);
if (offe != -1 && offe < endOffe) {
i = offe;
makeNewTerm();
reset();
} else {
reset();
}
} else if (branch.getStatus() == 2) {
// Status 2: a word ends here; emit it when it continues an earlier start and
// keep walking for a longer entry.
endOffe = i;
if (offe == -1) {
offe = i;
} else {
tempNature = branch.getParam()[0];
tempFreq = getInt(branch.getParam()[1], 50);
if (flag) {
makeNewTerm();
}
}
} else if (branch.getStatus() == 1) {
// Status 1: prefix only; remember where the candidate started.
if (offe == -1) {
offe = i;
}
}
}
// Emit a candidate that is still pending when the array ends.
if (offe != -1 && offe < endOffe) {
makeNewTerm();
}
}
}
/**
 * Parses str as an int.
 *
 * @param str text to parse
 * @param def value returned when str is not a valid integer
 * @return the parsed value, or def after logging a warning
 */
private int getInt(String str, int def) {
try {
return Integer.parseInt(str);
} catch (NumberFormatException e) {
logger.warn(str + "不是一个数字", e);
return def;
}
}
/**
 * Concatenates the names of the non-null terms from offe to endOffe (inclusive) and
 * inserts the result as a new term carrying tempNature and tempFreq.
 */
private void makeNewTerm() {
StringBuilder sb = new StringBuilder();
for (int j = offe; j <= endOffe; j++) {
if (terms[j] == null) {
continue;
} else {
sb.append(terms[j].getName());
}
}
TermNatures termNatures = new TermNatures(new TermNature(tempNature, tempFreq));
Term term = new Term(sb.toString(), offe, termNatures);
term.selfScore(-1 * tempFreq);
TermUtil.insertTerm(terms, term, type);
}
/**
 * Clears the candidate offsets, restores the default frequency and nature, and moves
 * the cursor back to the root of the current forest.
 */
private void reset() {
offe = -1;
endOffe = -1;
tempFreq = 50;
tempNature = null;
branch = forest;
}
/**
 * Advances a forest cursor by every character of a term's name.
 *
 * @param branch cursor to start from
 * @param term   term whose name is consumed character by character
 * @return the node reached, or null as soon as a character has no child node
 */
private SmartForest<String[]> termStatus(SmartForest<String[]> branch, Term term) {
String name = term.getName();
SmartForest<String[]> sf = branch;
for (int j = 0; j < name.length(); j++) {
sf = sf.get(name.charAt(j));
if (sf == null) {
return null;
}
}
return sf;
}
}

View File

@ -1,98 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.impl;
import org.ansj.domain.Nature;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.Recognition;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
/**
* jijiang feidiao
*
* @author ansj
*
*/
/**
 * Merges everything between a book-title opening mark and its closing mark (the marks
 * included) into a single term tagged with the {@code book} nature.
 *
 * @author ansj
 *
 */
public class BookRecognition implements Recognition {

    private static final long serialVersionUID = 1L;

    /** Nature assigned to every merged title term. */
    private static final Nature nature = new Nature("book");

    /** Maps an opening mark to the closing mark that ends the title. */
    private static final Map<String, String> ruleMap = new HashMap<>();

    static {
        ruleMap.put("《", "》");
    }

    /**
     * Rebuilds the term list of {@code result}, collapsing each complete
     * opening-mark ... closing-mark run into one term.
     *
     * <p>If an opening mark is never closed, the terms collected after it are appended
     * unchanged, so no term is lost.
     *
     * @param result analysis result whose term list is replaced
     */
    @Override
    public void recognition(Result result) {
        List<Term> terms = result.getTerms();
        // Closing mark currently being waited for; null while outside a title.
        String end = null;
        // Terms collected since the last unmatched opening mark.
        LinkedList<Term> mergeList = null;
        List<Term> list = new LinkedList<>();
        for (Term term : terms) {
            String name = term.getName();
            if (end == null) {
                end = ruleMap.get(name);
                if (end != null) {
                    mergeList = new LinkedList<>();
                    mergeList.add(term);
                } else {
                    list.add(term);
                }
            } else {
                mergeList.add(term);
                if (end.equals(name)) {
                    // Fold every collected term into the opening-mark term.
                    Term ft = mergeList.pollFirst();
                    for (Term sub : mergeList) {
                        ft.merage(sub);
                    }
                    ft.setNature(nature);
                    list.add(ft);
                    mergeList = null;
                    end = null;
                }
            }
        }
        // Unclosed title: keep the pending terms as they are. The previous code iterated
        // over `list` while adding to it, which either threw
        // ConcurrentModificationException or dropped these terms.
        if (mergeList != null) {
            list.addAll(mergeList);
        }
        result.setTerms(list);
    }
}

View File

@ -1,71 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.impl;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.library.DicLibrary;
import org.ansj.recognition.Recognition;
import org.nlpcn.commons.lang.tire.domain.Forest;
import java.util.List;
/**
 * Recognition step that is configured with one or more user dictionaries.
 *
 * <p>NOTE(review): the per-forest step currently only fetches the term list and changes
 * nothing.
 */
public class DicRecognition implements Recognition {

    private static final long serialVersionUID = 7487741700410080896L;

    /** Dictionaries visited in array order; {@code null} entries are skipped. */
    private Forest[] forests = null;

    /** Uses the dictionaries registered under the library's default key. */
    public DicRecognition() {
        this.forests = DicLibrary.gets(DicLibrary.DEFAULT);
    }

    /**
     * @param keys library keys of the dictionaries to use
     */
    public DicRecognition(String[] keys) {
        this.forests = DicLibrary.gets(keys);
    }

    /**
     * @param forests dictionaries to use
     */
    public DicRecognition(Forest[] forests) {
        this.forests = forests;
    }

    /**
     * @param forest the single dictionary to use
     */
    public DicRecognition(Forest forest) {
        this.forests = new Forest[] {forest};
    }

    /**
     * Applies the per-forest step once for each non-null dictionary.
     *
     * @param result analysis result handed to each step
     */
    @Override
    public void recognition(Result result) {
        for (int idx = 0; idx < forests.length; idx++) {
            Forest current = forests[idx];
            if (current != null) {
                recognition(result, current);
            }
        }
    }

    /**
     * Per-forest step; reads the term list of {@code result} and nothing else.
     */
    private void recognition(Result result, Forest forest) {
        final List<Term> termList = result.getTerms();
    }
}

View File

@ -1,75 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.impl;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.Recognition;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
*
*
* @author ansj
*
*/
/**
 * Recognition step around e-mail addresses.
 *
 * <p>NOTE(review): the scan for {@code "@"} performs no merging; the only visible effect
 * is that terms whose name is {@code null} are removed from the result.
 *
 * @author ansj
 *
 */
public class EmailRecognition implements Recognition {

    private static final String NOT_HEAD = "NOT";
    private static final String NATURE_HEAD = "nature:";
    private static final String ALL = "ALL";

    /** Feature table keyed by a literal character or by {@code "nature:" + nature}. */
    private static Map<String, String> FEATURE = new HashMap<>();

    static {
        FEATURE.put("-", NOT_HEAD);
        FEATURE.put("_", NOT_HEAD);
        FEATURE.put(".", NOT_HEAD);
        FEATURE.put(NATURE_HEAD + "en", ALL);
        FEATURE.put(NATURE_HEAD + "m", ALL);
    }

    /**
     * Visits every term, then drops the terms whose name is {@code null}.
     *
     * @param result analysis result whose term list is filtered in place
     */
    @Override
    public void recognition(Result result) {
        List<Term> terms = result.getTerms();
        for (Term candidate : terms) {
            if ("@".equals(candidate.getName())) {
                // An '@' was found; no action is taken for it.
                continue;
            }
        }
        Iterator<Term> cursor = terms.iterator();
        while (cursor.hasNext()) {
            if (cursor.next().getName() == null) {
                cursor.remove();
            }
        }
    }
}

View File

@ -1,75 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.impl;
import org.ansj.domain.Nature;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.Recognition;
import java.util.Iterator;
import java.util.List;
/**
*
*
* @author ansj
*
*/
/**
 * Tags numeric terms that look like an identity-card number with the {@code idcard}
 * nature: either 18 characters, or 17 characters directly followed by an {@code "x"} term.
 *
 * @author ansj
 *
 */
public class IDCardRecognition implements Recognition {

    private static final long serialVersionUID = -32133440735240290L;

    private static final Nature ID_CARD_NATURE = new Nature("idcard");

    /**
     * Marks matching terms and removes the {@code "x"} terms that were merged into their
     * predecessor.
     *
     * @param result analysis result whose term list is updated in place
     */
    @Override
    public void recognition(Result result) {
        List<Term> terms = result.getTerms();
        for (Term term : terms) {
            if (!"m".equals(term.getNatureStr())) {
                continue;
            }
            int digits = term.getName().length();
            if (digits == 18) {
                term.setNature(ID_CARD_NATURE);
                continue;
            }
            if (digits != 17) {
                continue;
            }
            Term next = term.to();
            if ("x".equals(next.getName())) {
                term.merage(next);
                // A null name flags the absorbed term for removal below.
                next.setName(null);
                term.setNature(ID_CARD_NATURE);
            }
        }
        Iterator<Term> cursor = terms.iterator();
        while (cursor.hasNext()) {
            if (cursor.next().getName() == null) {
                cursor.remove();
            }
        }
    }
}

View File

@ -1,306 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.impl;
import org.ansj.domain.*;
import org.ansj.library.DATDictionary;
import org.ansj.library.DicLibrary;
import org.ansj.recognition.Recognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.ansj.util.MathUtil;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.WordAlert;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
*
*
* @author ansj
*
*/
/**
 * Assigns a nature (part-of-speech tag) to every term of a result by scoring each
 * candidate nature of a term against the candidates of the preceding term and then
 * following the best-scoring back-links from the end marker.
 *
 * <p>The scoring state lives in instance fields, so one instance must not run two
 * recognitions at the same time.
 *
 * @author ansj
 *
 */
public class NatureRecognition implements Recognition {

    private static final long serialVersionUID = 1L;

    private static final Log logger = LogFactory.getLog();

    /** Suffix dictionary used by {@link #guessNature(String)}; multi-char keys are stored reversed. */
    private static final Forest SUFFIX_FOREST = new Forest();

    /** User dictionaries consulted by {@link #getParams(String)}. */
    private Forest[] forests = null;

    static {
        try (BufferedReader reader = MyStaticValue.getNatureClassSuffix()) {
            String temp = null;
            while ((temp = reader.readLine()) != null) {
                String[] split = temp.split("\t");
                String word = split[0];
                // Stored reversed because guessNature walks a word from its last character.
                if (word.length() > 1) {
                    word = new StringBuilder(word).reverse().toString();
                }
                SUFFIX_FOREST.add(word, new String[] {split[1]});
            }
        } catch (IOException e) {
            logger.warn("IO异常", e);
        }
    }

    /** Uses the library's default user dictionary. */
    public NatureRecognition() {
        forests = new Forest[] {DicLibrary.get()};
    }

    /**
     * @param forests user dictionaries to consult, in order
     */
    public NatureRecognition(Forest... forests) {
        this.forests = forests;
    }

    /** Start marker every first-column candidate is scored against. */
    private NatureTerm root = new NatureTerm(TermNature.BEGIN);

    /** End marker column appended after the last term. */
    private NatureTerm[] end = {new NatureTerm(TermNature.END)};

    private List<Term> terms = null;

    /** One row of candidate natures per term, plus the end-marker row. */
    private NatureTerm[][] natureTermTable = null;

    /**
     * Tags every term of {@code result} with the nature on the best-scoring path.
     *
     * @param result analysis result whose terms are tagged in place
     */
    @Override
    public void recognition(Result result) {
        this.terms = result.getTerms();
        natureTermTable = new NatureTerm[terms.size() + 1][];
        natureTermTable[terms.size()] = end;
        int length = terms.size();
        for (int i = 0; i < length; i++) {
            natureTermTable[i] = getNatureTermArr(terms.get(i).termNatures().termNatures);
        }
        walk();
    }

    /**
     * Tags a list of already segmented words, with offsets starting at 0.
     *
     * @param words segmented words
     * @return one tagged term per word
     */
    public List<Term> recognition(List<String> words) {
        return recognition(words, 0);
    }

    /**
     * Tags a list of already segmented words.
     *
     * @param words segmented words
     * @param offe offset of the first word; later offsets add the preceding word lengths
     * @return one tagged term per word
     */
    public List<Term> recognition(List<String> words, int offe) {
        List<Term> terms = new ArrayList<>(words.size());
        int tempOffe = 0;
        for (String word : words) {
            TermNatures tn = getTermNatures(word);
            terms.add(new Term(word, offe + tempOffe, tn));
            tempOffe += word.length();
        }
        // A fresh instance keeps this instance's scoring state untouched.
        new NatureRecognition().recognition(new Result(terms));
        return terms;
    }

    /**
     * Looks up the candidate natures of a word: system dictionary first, then the user
     * dictionaries, then English / number detection.
     *
     * @param word word to look up
     * @return the candidate natures, {@code TermNatures.NULL} when nothing matches
     */
    public TermNatures getTermNatures(String word) {
        String[] params = null;
        // System dictionary first, user dictionaries second.
        AnsjItem ansjItem = DATDictionary.getItem(word);
        TermNatures tn = null;
        if (ansjItem != AnsjItem.NULL) {
            tn = ansjItem.termNatures;
        } else if ((params = getParams(word)) != null) {
            tn = new TermNatures(new TermNature(params[0], 1));
        } else if (WordAlert.isEnglish(word)) {
            tn = TermNatures.EN;
        } else if (WordAlert.isNumber(word)) {
            tn = TermNatures.M;
        } else {
            tn = TermNatures.NULL;
        }
        return tn;
    }

    /**
     * Returns the parameters stored for {@code word} in the first user dictionary that
     * contains it as a complete entry (status greater than 1).
     *
     * <p>Every non-null dictionary is consulted in order. The earlier version returned
     * {@code null} as soon as the first dictionary missed, so later dictionaries were
     * never looked at.
     *
     * @param word word to look up
     * @return the entry's parameters, or {@code null} when no dictionary has the word
     */
    public String[] getParams(String word) {
        for (Forest forest : forests) {
            if (forest == null) {
                continue;
            }
            SmartForest<String[]> sf = forest;
            for (int i = 0; i < word.length() && sf != null; i++) {
                sf = sf.get(word.charAt(i));
            }
            if (sf != null && sf.getStatus() > 1) {
                return sf.getParam();
            }
            // Not a complete entry in this dictionary: try the next one.
        }
        return null;
    }

    /**
     * Guesses the nature of an unknown word from its suffix, falling back to person-name
     * checks.
     *
     * @param word word to classify
     * @return NT, NS, NR or NRF when a rule applies, otherwise NW
     */
    public static TermNatures guessNature(String word) {
        String nature = null;
        SmartForest<String[]> smartForest = SUFFIX_FOREST;
        // Number of trailing characters matched in the suffix dictionary.
        int len = 0;
        for (int i = word.length() - 1; i >= 0; i--) {
            smartForest = smartForest.get(word.charAt(i));
            if (smartForest == null) {
                break;
            }
            len++;
            if (smartForest.getStatus() == 2) {
                // A suffix ends here but a longer one may still match.
                nature = smartForest.getParam()[0];
            } else if (smartForest.getStatus() == 3) {
                nature = smartForest.getParam()[0];
                break;
            }
        }
        if ("nt".equals(nature) && (len > 1 || word.length() > 3)) {
            return TermNatures.NT;
        } else if ("ns".equals(nature)) {
            return TermNatures.NS;
        } else if (word.length() < 5) {
            Result parse = ToAnalysis.parse(word);
            for (Term term : parse.getTerms()) {
                if ("nr".equals(term.getNatureStr())) {
                    return TermNatures.NR;
                }
            }
        } else if (ForeignPersonRecognition.isFName(word)) {
            return TermNatures.NRF;
        }
        return TermNatures.NW;
    }

    /**
     * Scores every column of the table against its predecessor column, then writes the
     * best path back into the terms.
     */
    public void walk() {
        int length = natureTermTable.length - 1;
        setScore(root, natureTermTable[0]);
        for (int i = 0; i < length; i++) {
            for (int j = 0; j < natureTermTable[i].length; j++) {
                setScore(natureTermTable[i][j], natureTermTable[i + 1]);
            }
        }
        optimalRoot();
    }

    /** Offers {@code natureTerm} as predecessor to every candidate of the next column. */
    private void setScore(NatureTerm natureTerm, NatureTerm[] natureTerms) {
        for (int i = 0; i < natureTerms.length; i++) {
            natureTerms[i].setScore(natureTerm);
        }
    }

    /** Wraps each candidate nature of a term in a scoring node. */
    private NatureTerm[] getNatureTermArr(TermNature[] termNatures) {
        NatureTerm[] natureTerms = new NatureTerm[termNatures.length];
        for (int i = 0; i < natureTerms.length; i++) {
            natureTerms[i] = new NatureTerm(termNatures[i]);
        }
        return natureTerms;
    }

    /**
     * Follows the {@code from} links backwards from the end marker and assigns the
     * nature found at each step to the corresponding term.
     */
    private void optimalRoot() {
        NatureTerm to = end[0];
        NatureTerm from = null;
        int index = natureTermTable.length - 1;
        while ((from = to.from) != null && index > 0) {
            terms.get(--index).setNature(from.termNature.nature);
            to = from;
        }
    }

    /**
     * Scoring node for one candidate nature of one term.
     *
     * @author ansj
     *
     */
    public class NatureTerm {

        public TermNature termNature;

        /** Best path score found so far. */
        public double score = 0;

        /** Own weight: the nature's frequency plus one. */
        public double selfScore;

        /** Predecessor on the best path found so far. */
        public NatureTerm from;

        protected NatureTerm(TermNature termNature) {
            this.termNature = termNature;
            selfScore = termNature.frequency + 1;
        }

        /**
         * Adopts {@code natureTerm} as predecessor when there is none yet or when it
         * yields a higher score than the current one.
         *
         * @param natureTerm candidate predecessor
         */
        public void setScore(NatureTerm natureTerm) {
            double tempScore = MathUtil.compuNatureFreq(natureTerm, this);
            if (from == null || score < tempScore) {
                this.score = tempScore;
                this.from = natureTerm;
            }
        }

        @Override
        public String toString() {
            return termNature.nature.natureStr + "/" + selfScore;
        }
    }
}

View File

@ -1,151 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.impl;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.Recognition;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.util.*;
import java.util.regex.Pattern;
/**
* ,,.
*
* @author Ansj
*
*/
/**
 * Removes terms from a result by exact word, by nature string, or by regular expression.
 *
 * @author Ansj
 *
 */
public class StopRecognition implements Recognition {

    private static final Log LOG = LogFactory.getLog();

    private static final long serialVersionUID = 7041503137429986566L;

    /** Words that are dropped when a term's name equals one of them. */
    private Set<String> stop = new HashSet<>();

    /** Nature strings whose terms are dropped. */
    private Set<String> natureStop = new HashSet<>();

    /** Patterns; a term is dropped when its whole name matches one of them. */
    private Set<Pattern> regexList = new HashSet<>();

    /**
     * Adds stop words.
     *
     * @param filterWords words to filter out
     * @return this instance, for chaining
     */
    public StopRecognition insertStopWords(Collection<String> filterWords) {
        stop.addAll(filterWords);
        return this;
    }

    /**
     * Adds stop words.
     *
     * @param stopWords words to filter out
     * @return this instance, for chaining
     */
    public StopRecognition insertStopWords(String... stopWords) {
        Collections.addAll(stop, stopWords);
        return this;
    }

    /**
     * Adds nature strings (for example {@code nr}) whose terms are filtered out.
     *
     * @param stopNatures nature strings to filter out
     */
    public void insertStopNatures(String... stopNatures) {
        Collections.addAll(natureStop, stopNatures);
    }

    /**
     * Adds regular expressions; an expression that fails to compile is logged and skipped.
     *
     * @param regexes expressions a term name must fully match to be filtered out
     */
    public void insertStopRegexes(String... regexes) {
        for (String expression : regexes) {
            Pattern compiled;
            try {
                compiled = Pattern.compile(expression);
            } catch (Exception e) {
                LOG.error("regex err : " + expression, e);
                continue;
            }
            regexList.add(compiled);
        }
    }

    /**
     * Removes every term of {@code result} for which {@link #filter(Term)} is true.
     *
     * @param result analysis result whose term list is filtered in place
     */
    @Override
    public void recognition(Result result) {
        for (Iterator<Term> cursor = result.getTerms().iterator(); cursor.hasNext();) {
            if (filter(cursor.next())) {
                cursor.remove();
            }
        }
    }

    /**
     * Tells whether a term must be removed.
     *
     * @param term term to test
     * @return {@code true} when the name is a stop word, the nature is a stop nature, or
     *         the name fully matches a stop pattern
     */
    public boolean filter(Term term) {
        boolean stoppedByWord = !stop.isEmpty() && stop.contains(term.getName());
        if (stoppedByWord) {
            return true;
        }
        // The emptiness check keeps term.natrue() from being touched when unused.
        boolean stoppedByNature = !natureStop.isEmpty() && natureStop.contains(term.natrue().natureStr);
        if (stoppedByNature) {
            return true;
        }
        for (Pattern candidate : regexList) {
            if (candidate.matcher(term.getName()).matches()) {
                return true;
            }
        }
        return false;
    }

    /** Empties all three filter sets. */
    public void clear() {
        stop.clear();
        natureStop.clear();
        regexList.clear();
    }
}

View File

@ -1,68 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.impl;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.library.SynonymsLibrary;
import org.ansj.recognition.Recognition;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import java.util.List;
/**
*
*
* @author Ansj
*
*/
/**
 * Attaches synonym lists to the terms of a result, looked up by term name.
 *
 * @author Ansj
 *
 */
public class SynonymsRecgnition implements Recognition {

    private static final long serialVersionUID = 5961499108093950130L;

    /** Synonym dictionary keyed by word. */
    private SmartForest<List<String>> synonyms = null;

    /** Uses the library's default synonym dictionary. */
    public SynonymsRecgnition() {
        synonyms = SynonymsLibrary.get();
    }

    /**
     * @param key library key of the synonym dictionary to use
     */
    public SynonymsRecgnition(String key) {
        synonyms = SynonymsLibrary.get(key);
    }

    /**
     * @param synonyms synonym dictionary to use
     */
    public SynonymsRecgnition(SmartForest<List<String>> synonyms) {
        this.synonyms = synonyms;
    }

    /**
     * Sets the synonyms of every term whose name is a complete dictionary entry
     * (status greater than 1) with a non-null synonym list.
     *
     * @param result analysis result whose terms are updated in place
     */
    @Override
    public void recognition(Result result) {
        for (Term term : result) {
            SmartForest<List<String>> node = synonyms.getBranch(term.getName());
            if (node == null || node.getStatus() <= 1) {
                continue;
            }
            List<String> found = node.getParam();
            if (found == null) {
                continue;
            }
            term.setSynonyms(found);
        }
    }
}

View File

@ -1,96 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.impl;
import org.ansj.domain.Nature;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.Recognition;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
*
* @author sunyang
*
*/
/**
 * Merges runs of adjacent terms whose concatenated names match a date/time pattern into
 * a single term tagged with the "t" nature.
 *
 * @author sunyang
 *
 */
public class TimeRecognition implements Recognition {
private static final long serialVersionUID = 1L;
// Nature assigned to every merged time term.
private static final Nature nature = new Nature("t");
/**
 * Rebuilds the term list of the result, replacing each recognised time expression with
 * one merged term and keeping every other term as it is.
 *
 * @param result analysis result whose term list is replaced
 */
@Override
public void recognition(Result result) {
String name = "";
// Concatenation of the term names examined in the current look-ahead.
String timeWord = "";
List<Term> terms = result.getTerms();
// Terms examined in the current look-ahead, in order.
LinkedList<Term> mergeList = new LinkedList<>();
List<Term> list = new LinkedList<>();
// NOTE(review): the pattern is compiled on every call, and its empty "[]" character
// classes look like full-width digits lost in an encoding conversion - verify the
// literal against the upstream source before relying on it.
Pattern pattern =
Pattern.compile("((\\d|[]){1,4}年(\\d|[]){1,2}月(\\d|[]){1,2}[日|号](上午|下午|中午|晚)?(\\s)*((\\d|[]){1,2}([点|时|點|時])?((:)?(\\d|[]){1,2}(分)?((:)?(\\d|[]){1,2}(秒)?)?)?)?(\\s)*(PM|AM)?|(\\d|[]){1,2}(月|月份)(\\d|[]){1,2}([日|号])?(上午|下午|中午|晚)?(\\s)*((\\d|[]){1,2}([点|时|點|時])?((:)?(\\d|[]){1,2}(分)?((:)?(\\d|[]){1,2}(秒)?)?)?)?(\\s)*(PM|AM)?|(\\d|[]){1,2}日(上午|下午|中午|晚)?(\\s)*((\\d|[]){1,2}([点|时|點|時])?((:)?(\\d|[]){1,2}(分)?((:)?(\\d|[]){1,2}(秒)?)?)?)?(\\s)*(PM|AM)?|(昨天|昨日|昨日上午|昨日下午|昨日晚上|昨天早上|昨天上午|昨天中午|昨天下午|昨晚|昨夜|昨天晚上|今天早上|今天上午|今天下午|今晚|今天晚上|今日上午|今日下午|今日|今天|前天|今年|去年|当日|当日上午|上午|下午|中午|清晨|前晚|早上|凌晨|今晨|近日|日前|不久前)((\\d|[]){1,2}[点|时|點|時])?((:)?(\\d|[]){1,2}(分)?((:)?(\\d|[]){1,2}(秒)?)?)?(\\s)*(PM|AM)?|[\\“|\"](1|2|3|4|5|6|7|8|9|10|11|12)[·|.| |-](\\d|[]){1,2}[\\”|\"]|星期[一|二|三|四|五|六|天|日]|(\\d|[]){1,2}[点|时|點|時]((:)?(\\d|[]){1,2}(分)?((:)?(\\d|[]){1,2}(秒)?)?)?(\\s)*(PM|AM)?|(\\d|[]){4}年((\\d|[]){1,2}月)?|(\\d|[]){1,2}月|(正|一|二|三|四|五|六|七|八|九|十|十一|十二|腊)月((初|十|二十|三十)[ 一二三四五六七八九十])?(上午|下午|中午|晚)?|((\\d|[]){4}-(\\d|[]){2}-(\\d|[]){2})?(\\s)*(\\d|[]){2}:(\\d|[]){2}:(\\d|[]){2}|(\\d|[]){4}-(\\d|[]){2}-(\\d|[]){2}(\\s)*((\\d|[]){2}:(\\d|[]){2}:(\\d|[]){2})?)",
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
for (int i = 0; i < terms.size(); i++) {
boolean isTime = false;
Term termBase = terms.get(i);
// Number of terms covered by the longest match found, counted from termBase.
int timeTermsLength = 1;
int matchLength = 0; // number of terms examined so far in this look-ahead
// Look ahead over the following terms to find the longest run that forms a time
// expression. The original comment said "at most 14 words"; the bound in the code is 11.
for (int j = i; j < terms.size() && matchLength < 11; j++) {
Term term = terms.get(j);
name = term.getName();
timeWord += name;
Matcher matcher = pattern.matcher(timeWord);
mergeList.add(term);
if (matcher.matches()) {
isTime = true;
// i is advanced to the end of the match, so a later, longer match only adds
// the extra terms and the outer loop resumes after the matched run.
timeTermsLength += (j - i);
i = j;
}
matchLength++;
}
if (isTime) {
// Fold the matched terms into the first one; terms examined past the longest
// match stay out of the merge.
Term ft = mergeList.pollFirst();
for (int k = 0; k < timeTermsLength - 1; k++) {
ft.merageWithBlank(mergeList.get(k));
}
ft.setNature(nature);
list.add(ft);
} else {
list.add(termBase);
}
mergeList.clear();
timeWord = "";
}
result.setTerms(list);
}
}

View File

@ -1,85 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.recognition.impl;
import org.ansj.domain.Nature;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.library.DicLibrary;
import org.ansj.recognition.Recognition;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
/**
*
*
* @author ansj
*
*/
/**
 * Overrides the nature of every term that is a complete entry of a user dictionary with
 * the nature stored in that dictionary.
 *
 * @author ansj
 *
 */
public class UserDicNatureRecognition implements Recognition {

    private static final long serialVersionUID = 1L;

    /** User dictionaries; later entries take precedence over earlier ones. */
    private Forest[] forests = null;

    /** Uses the library's default user dictionary. */
    public UserDicNatureRecognition() {
        forests = new Forest[] {DicLibrary.get()};
    }

    /**
     * @param forests user dictionaries to consult; {@code null} entries are skipped
     */
    public UserDicNatureRecognition(Forest... forests) {
        this.forests = forests;
    }

    /**
     * For each term, searches the dictionaries from last to first and applies the nature
     * of the first one that contains the term's name.
     *
     * @param result analysis result whose terms are updated in place
     */
    @Override
    public void recognition(Result result) {
        for (Term term : result) {
            for (int i = forests.length - 1; i > -1; i--) {
                String[] params = getParams(forests[i], term.getName());
                if (params != null) {
                    term.setNature(new Nature(params[0]));
                    break;
                }
            }
        }
    }

    /**
     * Looks a word up in one dictionary.
     *
     * @param forest dictionary to search; {@code null} is treated as "word not found",
     *        matching how the other recognitions skip missing dictionaries
     * @param word word to look up
     * @return the entry's parameters when {@code word} is a complete entry (status
     *         greater than 1), otherwise {@code null}
     */
    public static String[] getParams(Forest forest, String word) {
        if (forest == null) {
            return null;
        }
        SmartForest<String[]> temp = forest;
        for (int i = 0; i < word.length(); i++) {
            temp = temp.get(word.charAt(i));
            if (temp == null) {
                return null;
            }
        }
        if (temp.getStatus() > 1) {
            return temp.getParam();
        } else {
            return null;
        }
    }
}

View File

@ -1,353 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.splitWord;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.AmbiguityLibrary;
import org.ansj.library.DicLibrary;
import org.ansj.splitWord.impl.GetWordsImpl;
import org.ansj.util.AnsjReader;
import org.ansj.util.Graph;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.GetWord;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.WordAlert;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import static org.ansj.library.DATDictionary.status;
/**
 * Base class of the segmenters. It turns text (a single string, or the lines delivered by an
 * {@link AnsjReader}) into a term graph and leaves the final path selection to subclasses via
 * {@link #getResult(Graph)}.
 *
 * @author ansj
 */
public abstract class Analysis {

    /** Offset of the line currently being segmented; applied to every term handed out by {@link #next()}. */
    public int offe;

    /** Dictionary matcher used for spans that are neither English-letter runs nor digit runs. */
    private GetWordsImpl gwi = new GetWordsImpl();

    /** User dictionaries made available to subclasses. */
    protected Forest[] forests = null;

    /** Dictionary of fixed segmentations applied before normal analysis; may be {@code null}. */
    private Forest ambiguityForest = AmbiguityLibrary.get();

    /** Whether person-name recognition is enabled. */
    protected Boolean isNameRecognition = true;

    /** Whether number recognition is enabled. */
    protected Boolean isNumRecognition = true;

    /** Whether numbers and quantifiers are merged. */
    protected Boolean isQuantifierRecognition = true;

    /** Whether terms also carry the original (real) text of the input. */
    protected Boolean isRealName = false;

    /** Source of lines for {@link #next()} and {@link #parse()}; set through {@code resetContent}. */
    private AnsjReader br;

    /** Initializes the dictionaries and the recognition switches from the global defaults. */
    protected Analysis() {
        this.forests = new Forest[] {DicLibrary.get()};
        this.isNameRecognition = MyStaticValue.isNameRecognition;
        this.isNumRecognition = MyStaticValue.isNumRecognition;
        this.isQuantifierRecognition = MyStaticValue.isQuantifierRecognition;
        this.isRealName = MyStaticValue.isRealName;
    }

    /** Terms of the current line that have not been handed out yet. */
    private LinkedList<Term> terms = new LinkedList<>();

    /**
     * Returns the next term of the reader content, segmenting a new line when the terms of the
     * previous one are used up. Intended to be called in a loop until it returns {@code null}.
     *
     * @return the next term, or {@code null} when the reader is exhausted or the line produced no terms
     * @throws IOException if reading from the underlying reader fails
     */
    public Term next() throws IOException {
        Term term = null;
        if (!terms.isEmpty()) {
            term = terms.poll();
            term.updateOffe(offe);
            return term;
        }
        String temp = br.readLine();
        while (StringUtil.isBlank(temp)) {
            if (temp == null) {
                offe = br.getStart();
                return null;
            }
            temp = br.readLine();
        }
        // Take the offset only after blank lines were skipped, so that it belongs to the line
        // that is actually segmented and not to a blank line that preceded it.
        offe = br.getStart();
        fullTerms(temp);
        if (!terms.isEmpty()) {
            term = terms.poll();
            term.updateOffe(offe);
            return term;
        }
        return null;
    }

    /** Segments {@code temp} and queues the resulting terms. */
    private void fullTerms(String temp) {
        List<Term> result = analysisStr(temp);
        terms.addAll(result);
    }

    /**
     * Segments one string: spans matched by the ambiguity dictionary are added with their
     * predefined segmentation, everything in between goes through {@link #analysis}.
     *
     * @param temp the text to segment
     * @return the terms selected by {@link #getResult(Graph)}
     */
    private List<Term> analysisStr(String temp) {
        Graph gp = new Graph(temp);
        int startOffe = 0;
        if (this.ambiguityForest != null) {
            GetWord gw = new GetWord(this.ambiguityForest, gp.chars);
            String[] params = null;
            while ((gw.getFrontWords()) != null) {
                // Analyse the text between the previous match and this one normally.
                if (gw.offe > startOffe) {
                    analysis(gp, startOffe, gw.offe);
                }
                params = gw.getParams();
                startOffe = gw.offe;
                // params is read as consecutive (word, nature) pairs.
                for (int i = 0; i < params.length; i += 2) {
                    gp.addTerm(new Term(params[i], startOffe, new TermNatures(new TermNature(params[i + 1], 1))));
                    startOffe += params[i].length();
                }
            }
        }
        if (startOffe < gp.chars.length) {
            analysis(gp, startOffe, gp.chars.length);
        }
        List<Term> result = this.getResult(gp);
        return result;
    }

    /**
     * Adds the candidate terms for {@code chars[startOffe, endOffe)} to the graph.
     *
     * <p>Runs of characters with status 4 become one term tagged {@code TermNatures.EN}, runs
     * with status 5 become one term tagged {@code TermNatures.M}; every other span is matched
     * against the core dictionary, and characters not covered by any dictionary word are added
     * as single-character terms tagged {@code TermNatures.NULL}.
     */
    private void analysis(Graph gp, int startOffe, int endOffe) {
        int start = 0;
        int end = 0;
        char[] chars = gp.chars;
        String str = null;
        for (int i = startOffe; i < endOffe; i++) {
            switch (status(chars[i])) {
                case 4:
                    start = i;
                    end = 1; // used as the run length here
                    while (++i < endOffe && status(chars[i]) == 4) {
                        end++;
                    }
                    str = WordAlert.alertEnglish(chars, start, end);
                    gp.addTerm(new Term(str, start, TermNatures.EN));
                    i--; // the for loop's i++ moves back onto the first character after the run
                    break;
                case 5:
                    start = i;
                    end = 1; // used as the run length here
                    while (++i < endOffe && status(chars[i]) == 5) {
                        end++;
                    }
                    str = WordAlert.alertNumber(chars, start, end);
                    gp.addTerm(new Term(str, start, TermNatures.M));
                    i--;
                    break;
                default:
                    start = i;
                    end = i;
                    int status = 0;
                    // Extend the span until the range ends or a status >= 4 character is hit.
                    do {
                        end = ++i;
                        if (i >= endOffe) {
                            break;
                        }
                        status = status(chars[i]);
                    } while (status < 4);
                    if (status > 3) {
                        i--; // let the outer loop handle the status >= 4 character
                    }
                    gwi.setChars(chars, start, end);
                    int max = start; // first position not yet covered by an added term
                    while ((str = gwi.allWords()) != null) {
                        Term term = new Term(str, gwi.offe, gwi.getItem());
                        // Fill the gap before this word with single-character terms.
                        while (max < term.getOffe()) {
                            gp.addTerm(new Term(String.valueOf(chars[max]), max, TermNatures.NULL));
                            max++;
                        }
                        gp.addTerm(term);
                        max = term.toValue();
                    }
                    // Fill what is left after the last dictionary word.
                    while (max < end) {
                        gp.addTerm(new Term(String.valueOf(chars[max]), max, TermNatures.NULL));
                        max++;
                    }
                    break;
            }
        }
    }

    /**
     * Copies, for every term, the matching slice of the graph's original text into the term's
     * real name. Does nothing unless real names are enabled for this instance (see
     * {@link #setIsRealName(Boolean)}; the default comes from {@code MyStaticValue.isRealName}).
     *
     * @param graph the graph the terms were taken from
     * @param result the terms to update in place
     */
    protected void setRealName(Graph graph, List<Term> result) {
        // Honour the per-instance switch; the global flag only provides its initial value.
        if (isRealName == null || !isRealName) {
            return;
        }
        String str = graph.realStr;
        // NOTE(review): realStr is assumed to be set for every graph; guard in case it is not.
        if (str == null) {
            return;
        }
        for (Term term : result) {
            term.setRealName(str.substring(term.getOffe(), term.getOffe() + term.getName().length()));
        }
    }

    /**
     * Segments a single string.
     *
     * @param temp the text to segment
     * @return the segmentation result
     */
    public Result parseStr(String temp) {
        return new Result(analysisStr(temp));
    }

    /**
     * Segments everything the current reader delivers.
     *
     * @return all terms in reading order
     * @throws IOException if reading from the underlying reader fails
     */
    public Result parse() throws IOException {
        List<Term> list = new ArrayList<>();
        Term temp = null;
        while ((temp = next()) != null) {
            list.add(temp);
        }
        Result result = new Result(list);
        return result;
    }

    /**
     * Selects the final terms from a graph filled with candidates.
     *
     * @param graph the candidate graph
     * @return the selected terms
     */
    protected abstract List<Term> getResult(Graph graph);

    /** Strategy object subclasses use to post-process a graph and produce the term list. */
    public abstract class Merger {
        public abstract List<Term> merger();
    }

    /**
     * Replaces the content to segment and resets the line offset.
     *
     * @param br the reader to take lines from
     */
    public void resetContent(AnsjReader br) {
        this.offe = 0;
        this.br = br;
    }

    /** Same as {@link #resetContent(AnsjReader)}, wrapping {@code reader} with the default buffer size. */
    public void resetContent(Reader reader) {
        this.offe = 0;
        this.br = new AnsjReader(reader);
    }

    /** Same as {@link #resetContent(AnsjReader)}, wrapping {@code reader} with a buffer of {@code buffer} chars. */
    public void resetContent(Reader reader, int buffer) {
        this.offe = 0;
        this.br = new AnsjReader(reader, buffer);
    }

    public Forest getAmbiguityForest() {
        return ambiguityForest;
    }

    public Analysis setAmbiguityForest(Forest ambiguityForest) {
        this.ambiguityForest = ambiguityForest;
        return this;
    }

    public Analysis setForests(Forest... forests) {
        this.forests = forests;
        return this;
    }

    public Analysis setIsNameRecognition(Boolean isNameRecognition) {
        this.isNameRecognition = isNameRecognition;
        return this;
    }

    public Analysis setIsNumRecognition(Boolean isNumRecognition) {
        this.isNumRecognition = isNumRecognition;
        return this;
    }

    public Analysis setIsQuantifierRecognition(Boolean isQuantifierRecognition) {
        this.isQuantifierRecognition = isQuantifierRecognition;
        return this;
    }

    public Analysis setIsRealName(Boolean isRealName) {
        this.isRealName = isRealName;
        return this;
    }
}

View File

@ -1,49 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.splitWord;
/**
 * Incremental word extractor: load text with {@link #setStr(String)} or
 * {@link #setChars(char[], int, int)}, then call {@link #allWords()} repeatedly.
 */
public interface GetWords {

    /**
     * Loads a whole string as the text to scan.
     *
     * @param temp the text to scan
     */
    void setStr(String temp);

    /**
     * Loads a range of a character array as the text to scan.
     *
     * @param chars the characters to scan
     * @param start index at which scanning starts
     * @param end index at which scanning stops
     */
    void setChars(char[] chars, int start, int end);

    /**
     * Returns the next word found in the loaded text.
     *
     * @return the next word, or {@code null} when there is none left
     */
    String allWords();

    /**
     * @return the offset recorded by the extractor, presumably that of the word last returned by
     *         {@link #allWords()} (confirm against the implementation)
     */
    int getOffe();
}

View File

@ -1,75 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.splitWord.analysis;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.ansj.util.Graph;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
/**
 * Plain segmenter: walks the graph once and returns the selected path as is, with no name,
 * number or user-dictionary recognition.
 *
 * @author ansj
 */
public class BaseAnalysis extends Analysis {

    public BaseAnalysis() {}

    /**
     * Creates a segmenter that reads its content from {@code reader}.
     *
     * @param reader source of the text to segment
     */
    public BaseAnalysis(Reader reader) {
        super.resetContent(new AnsjReader(reader));
    }

    /**
     * Segments {@code str} with a fresh {@code BaseAnalysis}.
     *
     * @param str the text to segment
     * @return the segmentation result
     */
    public static Result parse(String str) {
        BaseAnalysis analysis = new BaseAnalysis();
        return analysis.parseStr(str);
    }

    @Override
    protected List<Term> getResult(final Graph graph) {
        return new Merger() {
            @Override
            public List<Term> merger() {
                graph.walkPath();
                return collectTerms();
            }

            /** Gathers the non-null graph slots, skipping the last slot of the array. */
            private List<Term> collectTerms() {
                List<Term> collected = new ArrayList<>();
                int limit = graph.terms.length - 1;
                int idx = 0;
                while (idx < limit) {
                    Term candidate = graph.terms[idx];
                    if (candidate != null) {
                        collected.add(candidate);
                    }
                    idx++;
                }
                setRealName(graph, collected);
                return collected;
            }
        }.merger();
    }
}

View File

@ -1,153 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.splitWord.analysis;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.recognition.arrimpl.NumRecognition;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.ansj.util.Graph;
import org.ansj.util.NameFix;
import org.ansj.util.TermUtil;
import org.ansj.util.TermUtil.InsertTermType;
import org.nlpcn.commons.lang.tire.GetWord;
import org.nlpcn.commons.lang.tire.domain.Forest;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
/**
 * Segmenter that gives the user dictionaries priority: their words are inserted into the graph
 * (replacing existing candidates) before the path is walked and before number and person-name
 * recognition run.
 *
 * @author ansj
 */
public class DicAnalysis extends Analysis {

    public DicAnalysis() {
        super();
    }

    /**
     * Creates a segmenter that reads its content from {@code reader}.
     *
     * @param reader source of the text to segment
     */
    public DicAnalysis(Reader reader) {
        super.resetContent(new AnsjReader(reader));
    }

    /** Segments {@code str} with a fresh instance using the default dictionaries. */
    public static Result parse(String str) {
        DicAnalysis analysis = new DicAnalysis();
        return analysis.parseStr(str);
    }

    /** Segments {@code str} with a fresh instance using the given user dictionaries. */
    public static Result parse(String str, Forest... forests) {
        Analysis analysis = new DicAnalysis().setForests(forests);
        return analysis.parseStr(str);
    }

    @Override
    protected List<Term> getResult(final Graph graph) {
        return new Merger() {
            @Override
            public List<Term> merger() {
                // User dictionary words go in first.
                userDefineRecognition(graph, forests);
                graph.walkPath();

                // Number recognition.
                if (isNumRecognition && graph.hasNum) {
                    new NumRecognition().recognition(graph.terms);
                }

                // Person names: Asian names first, then foreign names, re-scoring after each.
                if (graph.hasPerson && isNameRecognition) {
                    new AsianPersonRecognition().recognition(graph.terms);
                    graph.walkPathByScore();
                    NameFix.nameAmbiguity(graph.terms);
                    new ForeignPersonRecognition().recognition(graph.terms);
                    graph.walkPathByScore();
                }
                return collectTerms();
            }

            /**
             * Inserts every word the dictionaries find in the graph's characters, walking the
             * dictionaries from last to first, then prunes and re-scores the graph.
             */
            private void userDefineRecognition(final Graph graph, Forest... forests) {
                if (forests == null) {
                    return;
                }
                final int beginOff = graph.terms[0].getOffe();
                for (int idx = forests.length - 1; idx >= 0; idx--) {
                    final Forest dictionary = forests[idx];
                    if (dictionary == null) {
                        continue;
                    }
                    final GetWord matcher = dictionary.getWord(graph.chars);
                    for (String found = matcher.getAllWords(); found != null; found = matcher.getAllWords()) {
                        // Only positions that already hold a term are considered.
                        if (graph.terms[matcher.offe] == null) {
                            continue;
                        }
                        // Parameter 1 is read as the frequency (50 when it is not a number),
                        // parameter 0 is passed to the term as its nature.
                        final int frequency = parseOrDefault(matcher.getParam()[1], 50);
                        final Term inserted =
                                new Term(found, beginOff + matcher.offe, matcher.getParam()[0], frequency);
                        inserted.selfScore(-Math.pow(Math.log(frequency), found.length()));
                        TermUtil.insertTerm(graph.terms, inserted, InsertTermType.REPLACE);
                    }
                }
                graph.rmLittlePath();
                graph.walkPathByScore();
                graph.rmLittlePath();
            }

            /** Parses {@code text} as an int, falling back to {@code fallback} when it is not one. */
            private int parseOrDefault(String text, int fallback) {
                int parsed;
                try {
                    parsed = Integer.parseInt(text);
                } catch (NumberFormatException e) {
                    parsed = fallback;
                }
                return parsed;
            }

            /** Gathers the non-null graph slots, skipping the last slot of the array. */
            private List<Term> collectTerms() {
                List<Term> collected = new ArrayList<>();
                int limit = graph.terms.length - 1;
                int idx = 0;
                while (idx < limit) {
                    Term candidate = graph.terms[idx];
                    if (candidate != null) {
                        collected.add(candidate);
                    }
                    idx++;
                }
                setRealName(graph, collected);
                return collected;
            }
        }.merger();
    }
}

View File

@ -1,163 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.splitWord.analysis;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.recognition.arrimpl.NumRecognition;
import org.ansj.recognition.arrimpl.UserDefineRecognition;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.ansj.util.Graph;
import org.ansj.util.NameFix;
import org.ansj.util.TermUtil.InsertTermType;
import org.nlpcn.commons.lang.tire.GetWord;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.util.ObjConver;
import java.io.Reader;
import java.util.*;
/**
 * Index-oriented segmenter: in addition to the terms on the selected path it also returns every
 * user-dictionary word found in the text, so overlapping words are part of the result.
 *
 * @author ansj
 */
public class IndexAnalysis extends Analysis {

    @Override
    protected List<Term> getResult(final Graph graph) {
        Merger merger = new Merger() {
            @Override
            public List<Term> merger() {
                graph.walkPath();

                // Number recognition.
                if (isNumRecognition && graph.hasNum) {
                    new NumRecognition().recognition(graph.terms);
                }

                // Person names: Asian names first, then foreign names, re-scoring after each.
                if (graph.hasPerson && isNameRecognition) {
                    new AsianPersonRecognition().recognition(graph.terms);
                    graph.walkPathByScore();
                    NameFix.nameAmbiguity(graph.terms);
                    new ForeignPersonRecognition().recognition(graph.terms);
                    graph.walkPathByScore();
                }

                // User dictionary recognition.
                userDefineRecognition(graph, forests);
                return result();
            }

            private void userDefineRecognition(final Graph graph, Forest... forests) {
                new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
                graph.rmLittlePath();
                graph.walkPathByScore();
            }

            /**
             * Builds the de-duplication key of a term. The offset comes first and is closed by
             * a separator, so a name ending in digits cannot be confused with part of the
             * offset (plain {@code name + offset} maps "x1" at 2 and "x" at 12 to the same key).
             */
            private String key(String name, int offset) {
                return offset + ":" + name;
            }

            /**
             * Collects the terms on the selected path plus every dictionary word not already
             * present at the same offset, ordered by offset and, at equal offsets, longest
             * name first.
             *
             * @return the ordered term list
             */
            private List<Term> result() {
                String temp = null;
                Set<String> seen = new HashSet<>();
                List<Term> result = new LinkedList<>();
                int length = graph.terms.length - 1;
                for (int i = 0; i < length; i++) {
                    Term term = graph.terms[i];
                    if (term != null) {
                        result.add(term);
                        seen.add(key(term.getName(), term.getOffe()));
                    }
                }

                LinkedList<Term> last = new LinkedList<>();
                char[] chars = graph.chars;
                if (forests != null) {
                    for (Forest forest : forests) {
                        if (forest == null) {
                            continue;
                        }
                        GetWord word = forest.getWord(chars);
                        while ((temp = word.getAllWords()) != null) {
                            // add() reports whether the key was new, which replaces contains + add.
                            if (seen.add(key(temp, word.offe))) {
                                last.add(new Term(temp, word.offe, word.getParam(0),
                                        ObjConver.getIntValue(word.getParam(1))));
                            }
                        }
                    }
                }
                result.addAll(last);

                Collections.sort(result, new Comparator<Term>() {
                    @Override
                    public int compare(Term o1, Term o2) {
                        if (o1.getOffe() == o2.getOffe()) {
                            // Longer names first; Integer.compare cannot overflow.
                            return Integer.compare(o2.getName().length(), o1.getName().length());
                        }
                        return Integer.compare(o1.getOffe(), o2.getOffe());
                    }
                });
                setRealName(graph, result);
                return result;
            }
        };
        return merger.merger();
    }

    public IndexAnalysis() {
        super();
    }

    /**
     * Creates a segmenter that reads its content from {@code reader}.
     *
     * @param reader source of the text to segment
     */
    public IndexAnalysis(Reader reader) {
        super.resetContent(new AnsjReader(reader));
    }

    /** Segments {@code str} with a fresh instance using the default dictionaries. */
    public static Result parse(String str) {
        return new IndexAnalysis().parseStr(str);
    }

    /** Segments {@code str} with a fresh instance using the given user dictionaries. */
    public static Result parse(String str, Forest... forests) {
        return new IndexAnalysis().setForests(forests).parseStr(str);
    }
}

View File

@ -1,288 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.splitWord.analysis;
import org.ansj.app.crf.SplitWord;
import org.ansj.dic.LearnTool;
import org.ansj.domain.*;
import org.ansj.library.CrfLibrary;
import org.ansj.recognition.arrimpl.*;
import org.ansj.recognition.impl.NatureRecognition;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.ansj.util.Graph;
import org.ansj.util.NameFix;
import org.ansj.util.TermUtil;
import org.ansj.util.TermUtil.InsertTermType;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.util.MapCount;
import org.nlpcn.commons.lang.util.WordAlert;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* ,
*
* @author ansj
*
*/
public class NlpAnalysis extends Analysis {
private static final Log LOG = LogFactory.getLog(NlpAnalysis.class);
private LearnTool learn = null;
private static final String TAB = "\t";
private static final int CRF_WEIGHT = 6;
private SplitWord splitWord = CrfLibrary.get();
@Override
protected List<Term> getResult(final Graph graph) {
Merger merger = new Merger() {
@Override
public List<Term> merger() {
if (learn == null) {
learn = new LearnTool();
}
graph.walkPath();
learn.learn(graph, splitWord, forests);
// 姓名识别
if (graph.hasPerson && isNameRecognition) {
// 亚洲人名识别
new AsianPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
NameFix.nameAmbiguity(graph.terms);
// 外国人名识别
new ForeignPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
}
if (splitWord != null) {
MapCount<String> mc = new MapCount<>();
// 通过crf分词
List<String> words = splitWord.cut(graph.chars);
Term tempTerm = null;
int tempOff = 0;
if (!words.isEmpty()) {
String word = words.get(0);
if (!isRuleWord(word)) {
mc.add("始##始" + TAB + word, CRF_WEIGHT);
}
}
for (String word : words) {
TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word); // 尝试从词典获取词性
Term term = null;
if (termNatures != TermNatures.NULL) {
term = new Term(word, tempOff, termNatures);
} else {
term = new Term(word, tempOff, TermNatures.NW);
term.setNewWord(true);
}
tempOff += word.length(); // 增加偏移量
if (isRuleWord(word)) { // 如果word不对那么不要了
tempTerm = null;
continue;
}
if (term.isNewWord()) { // 尝试猜测词性
termNatures = NatureRecognition.guessNature(word);
term.updateTermNaturesAndNature(termNatures);
}
TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
// 对于非词典中的词持有保守态度
if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
}
tempTerm = term;
if (term.isNewWord()) {
learn.addTerm(new NewWord(word, Nature.NW));
}
}
if (tempTerm != null && !tempTerm.isNewWord()) {
mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
}
graph.walkPath(mc.get());
} else {
LOG.warn("not find any crf model, make sure your config right? ");
}
// 数字发现
if (graph.hasNum && isNumRecognition) {
new NumRecognition().recognition(graph.terms);
}
// 词性标注
List<Term> result = getResult();
// 用户自定义词典的识别
new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
graph.rmLittlePath();
graph.walkPathByScore();
// 进行新词发现
new NewWordRecognition(learn).recognition(graph.terms);
graph.walkPathByScore();
// 优化后重新获得最优路径
result = getResult();
// 激活辞典
for (Term term : result) {
learn.active(term.getName());
}
setRealName(graph, result);
return result;
}
private List<Term> getResult() {
List<Term> result = new ArrayList<>();
int length = graph.terms.length - 1;
for (int i = 0; i < length; i++) {
if (graph.terms[i] == null) {
continue;
}
result.add(graph.terms[i]);
}
return result;
}
};
return merger.merger();
}
// 临时处理新词中的特殊字符
private static final Set<Character> filter = new HashSet<>();
static {
filter.add(':');
filter.add(' ');
filter.add('');
filter.add(' ');
filter.add('');
filter.add('”');
filter.add('“');
filter.add('');
filter.add('。');
filter.add('');
filter.add('。');
filter.add(',');
filter.add('.');
filter.add('、');
filter.add('\\');
filter.add('');
filter.add(';');
filter.add('');
filter.add('?');
filter.add('!');
filter.add('\"');
filter.add('');
filter.add('');
filter.add('(');
filter.add(')');
filter.add('…');
filter.add('…');
filter.add('—');
filter.add('-');
filter.add('');
filter.add('—');
filter.add('《');
filter.add('》');
}
/**
*
*
* @param word
* @return
*/
public static boolean isRuleWord(String word) {
char c = 0;
for (int i = 0; i < word.length(); i++) {
c = word.charAt(i);
if (c != '·') {
if (c < 256 || filter.contains(c) || (c = WordAlert.CharCover(word.charAt(i))) > 0) {
return true;
}
}
}
return false;
}
public NlpAnalysis setCrfModel(SplitWord splitWord) {
this.splitWord = splitWord;
return this;
}
public NlpAnalysis setLearnTool(LearnTool learn) {
this.learn = learn;
return this;
}
public NlpAnalysis() {
super();
}
public NlpAnalysis(Reader reader) {
super.resetContent(new AnsjReader(reader));
}
public static Result parse(String str) {
return new NlpAnalysis().parseStr(str);
}
public static Result parse(String str, Forest... forests) {
return new NlpAnalysis().setForests(forests).parseStr(str);
}
}

View File

@ -1,116 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.splitWord.analysis;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.recognition.arrimpl.NumRecognition;
import org.ansj.recognition.arrimpl.UserDefineRecognition;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.ansj.util.Graph;
import org.ansj.util.NameFix;
import org.ansj.util.TermUtil.InsertTermType;
import org.nlpcn.commons.lang.tire.domain.Forest;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
/**
 * Standard segmenter: walks the graph, runs number and person-name recognition, and applies
 * the user dictionaries last without overriding what is already in the graph
 * ({@code InsertTermType.SKIP}).
 *
 * @author ansj
 */
public class ToAnalysis extends Analysis {

    public ToAnalysis() {
        super();
    }

    /**
     * Creates a segmenter that reads its content from {@code reader}.
     *
     * @param reader source of the text to segment
     */
    public ToAnalysis(Reader reader) {
        super.resetContent(new AnsjReader(reader));
    }

    /** Segments {@code str} with a fresh instance using the default dictionaries. */
    public static Result parse(String str) {
        ToAnalysis analysis = new ToAnalysis();
        return analysis.parseStr(str);
    }

    /** Segments {@code str} with a fresh instance using the given user dictionaries. */
    public static Result parse(String str, Forest... forests) {
        Analysis analysis = new ToAnalysis().setForests(forests);
        return analysis.parseStr(str);
    }

    @Override
    protected List<Term> getResult(final Graph graph) {
        return new Merger() {
            @Override
            public List<Term> merger() {
                graph.walkPath();

                // Number recognition.
                if (isNumRecognition && graph.hasNum) {
                    new NumRecognition().recognition(graph.terms);
                }

                // Person names: Asian names first, then foreign names, re-scoring after each.
                if (graph.hasPerson && isNameRecognition) {
                    new AsianPersonRecognition().recognition(graph.terms);
                    graph.walkPathByScore();
                    NameFix.nameAmbiguity(graph.terms);
                    new ForeignPersonRecognition().recognition(graph.terms);
                    graph.walkPathByScore();
                }

                // User dictionary recognition.
                applyUserDictionaries(graph, forests);
                return collectTerms();
            }

            /** Lets the user dictionaries add their words, then prunes and re-scores the graph. */
            private void applyUserDictionaries(final Graph graph, Forest... forests) {
                UserDefineRecognition recognizer = new UserDefineRecognition(InsertTermType.SKIP, forests);
                recognizer.recognition(graph.terms);
                graph.rmLittlePath();
                graph.walkPathByScore();
            }

            /** Gathers the non-null graph slots, skipping the last slot of the array. */
            private List<Term> collectTerms() {
                List<Term> collected = new ArrayList<>();
                int limit = graph.terms.length - 1;
                int idx = 0;
                while (idx < limit) {
                    Term candidate = graph.terms[idx];
                    if (candidate != null) {
                        collected.add(candidate);
                    }
                    idx++;
                }
                setRealName(graph, collected);
                return collected;
            }
        }.merger();
    }
}

View File

@ -1,149 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.splitWord.impl;
import org.ansj.domain.AnsjItem;
import org.ansj.library.DATDictionary;
import org.ansj.splitWord.GetWords;
/**
* Word extractor over the core dictionary ({@code DATDictionary}). After text has been loaded,
* each call to {@link #allWords()} resumes scanning where the previous call stopped and returns
* the next word, or {@code null} once the loaded range is exhausted.
*
* All scanning state is kept in instance fields and there is no synchronization, so an
* instance must not be shared between threads that scan concurrently.
*/
public class GetWordsImpl implements GetWords {
/**
* Offset, in {@link #chars}, of the word most recently returned by {@link #allWords()}.
*/
public int offe;
/**
* Creates an extractor and loads {@code str} through {@link #setStr(String)}.
*/
public GetWordsImpl(String str) {
setStr(str);
}
/**
* Creates an extractor with no text loaded; call {@link #setStr(String)} or
* {@link #setChars(char[], int, int)} before scanning.
*/
public GetWordsImpl() {}
// Exclusive upper bound of the scan (the "end" argument of setChars).
int charsLength = 0;
@Override
public void setStr(String str) {
setChars(str.toCharArray(), 0, str.length());
}
/**
* Loads {@code chars} and restricts scanning to the index range {@code [start, end)}.
* Only the scan position, the word start and checkValue are reset here; baseValue and the
* public {@code end} counter are left as they are.
*/
@Override
public void setChars(char[] chars, int start, int end) {
this.chars = chars;
i = start;
this.start = start;
charsLength = end;
checkValue = 0;
}
// Text being scanned.
public char[] chars;
// Code of the character currently being consumed.
private int charHashCode;
// Index at which the word candidate currently being matched begins.
private int start = 0;
// Count of characters consumed since the last reset; incremented and reset in allWords but
// not read anywhere in this class.
public int end = 0;
// Current dictionary array index; 0 is the state the scan restarts from.
private int baseValue = 0;
// Index the current step came from; compared with the "check" value of the target item.
private int checkValue = 0;
// Index of the dictionary item behind the word last returned; read by getItem().
private int tempBaseValue = 0;
// Current scan position in chars.
public int i = 0;
private String str = null;
/**
* Returns the next word in the loaded range.
*
* @return the next word, or {@code null} when the range is exhausted (the scan position,
* baseValue and the end counter are reset to 0 in that case)
*/
@Override
public String allWords() {
for (; i < charsLength; i++) {
charHashCode = chars[i];
end++;
switch (getStatement()) {
case 0:
// No dictionary transition for this character.
if (baseValue == chars[i]) {
// The computed index equals the character code itself: the character is returned as a
// one-character word and the scan restarts after it. tempBaseValue becomes 0 here, so
// getItem() yields dictionary item 0 for such a word.
str = String.valueOf(chars[i]);
offe = i;
start = ++i;
end = 0;
baseValue = 0;
tempBaseValue = baseValue;
return str;
} else {
int startCharStatus = DATDictionary.getItem(chars[start]).getStatus();
if (startCharStatus == 1) { // if the status of the item at start is 1, move start to i; otherwise advance start by 1
// i-- makes the loop's i++ process the same character again from the restart state.
start = i;
i--;
end = 0;
baseValue = 0;
} else {
// Set i to the old start so that the loop's i++ resumes one character after it.
i = start;
start++;
end = 0;
baseValue = 0;
}
break;
}
case 2:
// Status 2: a word ends here; baseValue is kept, so the next call continues matching
// longer words from the same start.
i++;
offe = start;
tempBaseValue = baseValue;
return DATDictionary.getItem(tempBaseValue).getName();
case 3:
// Status 3: a word ends here and matching restarts one character after the word's start.
offe = start;
start++;
i = start;
end = 0;
tempBaseValue = baseValue;
baseValue = 0;
return DATDictionary.getItem(tempBaseValue).getName();
}
}
end = 0;
baseValue = 0;
i = 0;
return null;
}
/**
* Advances the dictionary state by the current character (charHashCode) and reports the
* status of the item reached.
*
* @return the status of the item at the new index when that index is below
* {@code DATDictionary.arrayLength} and the item's check value equals the previous
* index or -1; otherwise 0. allWords() treats 0 as "no transition", 2 and 3 as
* "a word ends here", and takes no action for any other value.
*/
private int getStatement() {
checkValue = baseValue;
baseValue = DATDictionary.getItem(checkValue).getBase() + charHashCode;
if (baseValue < DATDictionary.arrayLength && (DATDictionary.getItem(baseValue).getCheck() == checkValue
|| DATDictionary.getItem(baseValue).getCheck() == -1)) {
return DATDictionary.getItem(baseValue).getStatus();
}
return 0;
}
/**
* @return the dictionary item behind the word last returned by {@link #allWords()}
*/
public AnsjItem getItem() {
return DATDictionary.getItem(tempBaseValue);
}
@Override
public int getOffe() {
return offe;
}
}

View File

@ -1,240 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.util;
import java.io.IOException;
import java.io.Reader;
/**
* jdk... 线,西 reader
* \r\n .start
*
* @author ansj
*
*/
public class AnsjReader extends Reader {
private Reader in;
private char cb[];
private static int defaultCharBufferSize = 8192;
/**
* Creates a buffering character-input stream that uses an input buffer of
* the specified size.
*
* @param in
* A Reader
* @param sz
* Input-buffer size
*
* @exception IllegalArgumentException
* If {@code sz <= 0}
*/
public AnsjReader(Reader in, int sz) {
super(in);
if (sz <= 0)
throw new IllegalArgumentException("Buffer size <= 0");
this.in = in;
cb = new char[sz];
}
/**
* Creates a buffering character-input stream that uses a default-sized
* input buffer.
*
* @param in
* A Reader
*/
public AnsjReader(Reader in) {
this(in, defaultCharBufferSize);
}
/** Checks to make sure that the stream has not been closed */
private void ensureOpen() throws IOException {
if (in == null)
throw new IOException("Stream closed");
}
/**
*
*/
@Override
public int read(char cbuf[], int off, int len) throws IOException {
throw new IOException("AnsjBufferedReader not support this interface! ");
}
private int start = 0;
private int tempStart = 0;
/**
* ps \n \r
*/
public String readLine() throws IOException {
ensureOpen();
StringBuilder sb = null;
start = tempStart;
firstRead = true;
while (true) {
tempLen = 0;
ok = false;
readString();
// if (tempLen != 0)
// System.out.println(new String(cb, tempOffe, tempLen));
if (!isRead && (tempLen == 0 || len == 0)) {
if (sb != null) {
return sb.toString();
}
return null;
}
if (!isRead) { // 如果不是需要读状态,那么返回
tempStart += tempLen;
if (sb == null) {
return new String(cb, tempOffe, tempLen);
} else {
sb.append(cb, tempOffe, tempLen);
return sb.toString();
}
}
if (tempLen == 0) {
continue;
}
// 如果是需要读状态那么读取
if (sb == null) {
sb = new StringBuilder();
}
sb.append(cb, tempOffe, tempLen);
tempStart += tempLen;
}
}
// Next index in cb to scan. 0 means the buffer must be refilled before
// scanning; -1 marks that the end of input was detected.
int offe = 0;
// Number of chars delivered by the most recent in.read(cb).
int len = 0;
// Set by readString(): true when readLine() has to call it again to finish
// the current line, false when the line is complete or input is exhausted.
boolean isRead = false;
// Reset by readLine() on every pass; not read anywhere in the visible code.
boolean ok = false;
// True until the first content character of the current line has been seen.
boolean firstRead = true;
// Start index and length of the line fragment found in cb by the last
// readString() call.
int tempOffe;
int tempLen;
/**
 * Scans the buffer for the next fragment of the current line and reports
 * it through {@code tempOffe}/{@code tempLen}, refilling the buffer from
 * the underlying reader when {@code offe} is 0.
 *
 * NOTE(review): a fill shorter than the buffer ({@code len < cb.length}) is
 * treated as end of input, but Reader.read(char[]) may return fewer chars
 * than requested before the stream ends - confirm this is safe for the
 * readers used here.
 *
 * @throws IOException
 *             if the underlying read fails
 */
private void readString() throws IOException {
if (offe <= 0) {
if (offe == -1) {
isRead = false;
return;
}
len = in.read(cb);
if (len <= 0) { // reached the end of the input
isRead = false;
return;
}
}
isRead = true;
char c = 0;
int i = offe;
// Step over line terminators located at the scan position.
for (; i < len; i++) {
c = cb[i];
if (c != '\r' && c != '\n') {
break;
}
// Content was already collected for this line, so this terminator ends
// it: consume exactly one terminator char and report "line complete".
if (!firstRead) {
i++;
tempStart++;
offe = i;
tempOffe = offe;
isRead = false;
return;
}
// Still before the first content char: skip the terminator and move the
// reported line start forward.
tempStart++;
start++;
}
// Buffer exhausted while skipping terminators: request a refill.
if (i == len) {
isRead = true;
offe = 0;
return;
}
firstRead = false;
offe = i;
// Scan the content up to the next terminator or the end of the buffer.
for (; i < len; i++) {
c = cb[i];
if (c == '\n' || c == '\r') {
isRead = false;
break;
}
}
tempOffe = offe;
tempLen = i - offe;
if (i == len) {
if (len < cb.length) { // buffer was not filled completely: treated as end of input
isRead = false;
offe = -1;
} else {
offe = 0;
}
} else {
offe = i;
}
}
/**
 * Closes the underlying reader and releases the buffer. Calling it again
 * after the reader has been closed is a no-op.
 */
@Override
public void close() throws IOException {
    synchronized (lock) {
        if (in != null) {
            try {
                in.close();
            } finally {
                // Drop the references even when close() itself fails.
                in = null;
                cb = null;
            }
        }
    }
}
/**
 * Returns the value of {@code start}: the character offset recorded for the
 * beginning of the line produced by the latest {@link #readLine()} call
 * (leading line terminators that were skipped are counted in).
 */
public int getStart() {
return this.start;
}
}

View File

@ -1,372 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.util;
import org.ansj.domain.AnsjItem;
import org.ansj.domain.Term;
import org.ansj.domain.TermNatures;
import org.ansj.library.DATDictionary;
import org.ansj.splitWord.Analysis.Merger;
import org.ansj.util.TermUtil.InsertTermType;
import java.util.List;
import java.util.Map;
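/**
 * Word graph built over one input string: {@code terms[i]} holds the chain of
 * candidate terms that start at character offset {@code i}, bracketed by a
 * begin sentinel ({@code root}) and an end sentinel ({@code end}).
 *
 * @author ansj
 *
 */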
public class Graph {
// Characters of the input string (set from str.toCharArray() in the constructor).
public char[] chars = null;
// The original input string.
public String realStr = null;
// Slot i holds the first term of the chain starting at character offset i;
// the extra last slot (index chars.length) holds the end sentinel.
public Term[] terms = null;
// End sentinel, positioned at offset chars.length.
protected Term end = null;
// Begin sentinel, positioned at offset -1.
protected Term root = null;
// Names of the end / begin sentinel terms.
protected static final String E = "末##末";
protected static final String B = "始##始";
// Whether a person name is present
public boolean hasPerson;
// Whether a number is present
public boolean hasNum;
// Whether there is ambiguity. NOTE(review): dangling comment - no field follows it.
public Graph(String str) {
realStr = str;
this.chars = str.toCharArray();
terms = new Term[chars.length + 1];
end = new Term(E, chars.length, AnsjItem.END);
root = new Term(B, -1, AnsjItem.BEGIN);
terms[chars.length] = end;
}
/**
 * Produces the term list by delegating to {@code merger.merger()}.
 *
 * @param merger
 *            strategy that performs the actual merge
 * @return whatever {@code merger.merger()} returns
 */
public List<Term> getResult(Merger merger) {
return merger.merger();
}
/**
 * Adds a candidate term to the graph, replacing an existing candidate of
 * the same length at the same offset, and records whether any number or
 * person-name candidate has been seen so far.
 *
 * @param term
 *            the candidate to insert at {@code term.getOffe()}
 */
public void addTerm(Term term) {
    // Sticky flags: once set they stay set, and the attribute is not
    // inspected again (short-circuit).
    hasNum = hasNum || term.termNatures().numAttr.numFreq > 0;
    hasPerson = hasPerson || term.termNatures().personAttr.flag;
    TermUtil.insertTerm(terms, term, InsertTermType.REPLACE);
}
/**
 * Walks the {@code from} links backwards from the end sentinel and rewrites
 * {@code terms} so that only the terms on that path remain: slots strictly
 * between two consecutive path terms are cleared, each path term is stored
 * at its own offset, its sibling chain is cut and its forward ({@code to})
 * link is set.
 *
 * @return the begin sentinel {@code root}
 */
protected Term optimalRoot() {
Term to = end;
to.clearScore();
Term from = null;
while ((from = to.from()) != null) {
// Clear every slot between the two path terms.
for (int i = from.getOffe() + 1; i < to.getOffe(); i++) {
terms[i] = null;
}
// The begin sentinel sits at offset -1 and has no slot of its own.
if (from.getOffe() > -1) {
terms[from.getOffe()] = from;
}
// Cut the sibling (horizontal) chain to save memory
from.setNext(null);
from.setTo(to);
from.clearScore();
to = from;
}
return root;
}
/**
 * Prunes shorter candidates: for each offset, when the last term of the
 * chain (as returned by {@code getMaxTerm}) is not overlapped by a term
 * that reaches further, that term becomes the only candidate at the offset
 * and every slot it covers is cleared.
 *
 * Presumably {@code toValue()} is the end offset of a term - TODO confirm.
 */
public void rmLittlePath() {
int maxTo = -1;
Term temp = null;
Term maxTerm = null;
// Whether a crossing (overlapping) term was found
boolean flag = false;
final int length = terms.length - 1;
for (int i = 0; i < length; i++) {
maxTerm = getMaxTerm(i);
if (maxTerm == null)
continue;
maxTo = maxTerm.toValue();
// Length 1: nothing to prune at this offset. Length 2 with an empty next
// slot: nothing can overlap, skip past it. Any other case falls through
// to the overlap check below.
switch (maxTerm.getName().length()) {
case 1:
continue;
case 2:
if (terms[i + 1] == null) {
i = i + 1;
continue;
}
}
// Look for a term starting inside maxTerm's span that reaches beyond it.
for (int j = i + 1; j < maxTo; j++) {
temp = getMaxTerm(j);
if (temp == null) {
continue;
}
if (maxTo < temp.toValue()) {
maxTo = temp.toValue();
flag = true;
}
}
if (flag) {
// Overlap found: leave the whole region untouched and resume after it.
i = maxTo - 1;
flag = false;
} else {
maxTerm.setNext(null);
terms[i] = maxTerm;
for (int j = i + 1; j < maxTo; j++) {
terms[j] = null;
}
// FIXME: in theory the back links should be restored here. It has run for a
// long time without errors, so presumably nothing depends on the two-way
// link. Needs confirmation whether the code below is required.
// // restore the from link of the terms that follow
// temp = terms[i+maxTerm.getName().length()] ;
// do{
// temp.setFrom(maxTerm) ;
// }while((temp=temp.next())!=null) ;
}
}
}
/**
 * Returns the last term of the sibling chain stored at slot {@code i}.
 *
 * @param i
 *            slot index into {@code terms}
 * @return the tail of the chain, or {@code null} when the slot is empty
 */
private Term getMaxTerm(int i) {
    Term tail = terms[i];
    if (tail == null) {
        return null;
    }
    for (Term cursor = tail.next(); cursor != null; cursor = cursor.next()) {
        tail = cursor;
    }
    return tail;
}
/**
 * Clears single-character head terms that lie inside the span of a longer
 * head term. Only the head of each slot is inspected, not its siblings.
 */
public void rmLittleSinglePath() {
    for (int i = 0; i < terms.length; i++) {
        Term head = terms[i];
        if (head == null) {
            continue;
        }
        int maxTo = head.toValue();
        // Nothing to remove when the span is one slot wide or the slot is the last one.
        boolean singleSlot = maxTo - i == 1;
        boolean lastSlot = i + 1 == terms.length;
        if (singleSlot || lastSlot) {
            continue;
        }
        for (int j = i; j < maxTo; j++) {
            Term candidate = terms[j];
            if (candidate == null) {
                continue;
            }
            if (candidate.toValue() <= maxTo && candidate.getName().length() == 1) {
                terms[j] = null;
            }
        }
    }
}
/**
 * Score-based pruning: for each non-empty slot a starting term is selected
 * from the chain; then, for that term and every sibling after it, the slots
 * covered by the term are cleared when no term inside the span reaches
 * further or has a smaller score.
 *
 * NOTE(review): {@code maxScore} is never assigned inside the selection
 * loop, so the comparisons there are always made against 0 - confirm
 * whether that is intended.
 */
public void rmLittlePathByScore() {
int maxTo = -1;
Term temp = null;
for (int i = 0; i < terms.length; i++) {
if (terms[i] == null) {
continue;
}
Term maxTerm = null;
double maxScore = 0;
Term term = terms[i];
// Find the term with the best own score, preferring the longest
do {
if (maxTerm == null || maxScore > term.score()) {
maxTerm = term;
} else if (maxScore == term.score() && maxTerm.getName().length() < term.getName().length()) {
maxTerm = term;
}
} while ((term = term.next()) != null);
term = maxTerm;
do {
maxTo = term.toValue();
maxScore = term.score();
// 'continue' jumps to the do-while condition, i.e. on to the next sibling.
if (maxTo - i == 1 || i + 1 == terms.length)
continue;
boolean flag = true;// deletion allowed
out: for (int j = i; j < maxTo; j++) {
temp = terms[j];
if (temp == null) {
continue;
}
do {
if (temp.toValue() > maxTo || temp.score() < maxScore) {
flag = false;
break out;
}
} while ((temp = temp.next()) != null);
}
// Check passed: the covered slots can be cleared
if (flag) {
for (int j = i + 1; j < maxTo; j++) {
terms[j] = null;
}
}
} while ((term = term.next()) != null);
}
}
/**
 * Propagates self-score based path scores through the graph, starting at
 * the begin sentinel, and then reduces the graph to the selected path via
 * {@code optimalRoot()}.
 */
public void walkPathByScore() {
    // Score the successors of BEGIN first
    mergerByScore(root, 0);
    // Then score forward, slot by slot, from every term that is already reachable
    for (int i = 0; i < terms.length; i++) {
        for (Term current = terms[i];
                current != null && current.from() != null && current != end;
                current = current.next()) {
            mergerByScore(current, current.toValue());
        }
    }
    optimalRoot();
}
/**
 * Same as {@link #walkPath(Map)} with no relation map.
 */
public void walkPath() {
    walkPath(null);
}

/**
 * Propagates path scores through the graph, starting at the begin
 * sentinel, and then reduces the graph to the selected path via
 * {@code optimalRoot()}.
 *
 * @param relationMap
 *            optional map handed through to {@code Term.setPathScore};
 *            may be null
 */
public void walkPath(Map<String, Double> relationMap) {
    // Score the successors of BEGIN first
    merger(root, 0, relationMap);
    // Then score forward, slot by slot, from every term that is already reachable
    for (int i = 0; i < terms.length; i++) {
        for (Term current = terms[i];
                current != null && current.from() != null && current != end;
                current = current.next()) {
            merger(current, current.toValue(), relationMap);
        }
    }
    optimalRoot();
}
/**
 * Offers {@code fromTerm} as predecessor to every term starting at slot
 * {@code to}. When the slot is empty, a single-character term is created
 * for {@code chars[to]} first, so the path never breaks.
 *
 * @param fromTerm
 *            the candidate predecessor
 * @param to
 *            slot index at which the successors start
 * @param relationMap
 *            optional map handed through to {@code Term.setPathScore};
 *            may be null
 */
private void merger(Term fromTerm, int to, Map<String, Double> relationMap) {
    Term head = terms[to];
    if (head == null) {
        // No candidate here: fall back to the single character.
        char c = chars[to];
        TermNatures tn = DATDictionary.getItem(c).termNatures;
        if (tn == null) {
            tn = TermNatures.NULL;
        }
        Term single = new Term(String.valueOf(c), to, tn);
        terms[to] = single;
        single.setPathScore(fromTerm, relationMap);
        return;
    }
    for (Term successor = head; successor != null; successor = successor.next()) {
        // relation: to.set(from)
        successor.setPathScore(fromTerm, relationMap);
    }
}
/**
 * Offers {@code fromTerm} as predecessor to every term starting at slot
 * {@code to}, using the terms' own scores. An empty slot is left as is.
 *
 * @param fromTerm
 *            the candidate predecessor
 * @param to
 *            slot index at which the successors start
 */
private void mergerByScore(Term fromTerm, int to) {
    for (Term successor = terms[to]; successor != null; successor = successor.next()) {
        // relation: to.set(from)
        successor.setPathSelfScore(fromTerm);
    }
}
/**
 * Debug helper: writes one line per non-empty slot to standard output. The
 * head term is printed by name, its siblings via {@code toString()}, each
 * followed by its score.
 */
public void printGraph() {
    for (Term head : terms) {
        if (head == null) {
            continue;
        }
        System.out.print(head.getName() + "\t" + head.score() + " ,");
        for (Term sibling = head.next(); sibling != null; sibling = sibling.next()) {
            System.out.print(sibling + "\t" + sibling.score() + " ,");
        }
        System.out.println();
    }
}
}

View File

@ -1,108 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.util;
import org.ansj.domain.Term;
import org.ansj.library.NatureLibrary;
import org.ansj.library.NgramLibrary;
import org.ansj.recognition.impl.NatureRecognition.NatureTerm;
import java.util.Map;
/**
 * Scoring formulas used when weighting paths between terms.
 */
public class MathUtil {

    // Smoothing parameter
    private static final double D_SMOOTHING_PARA = 0.1;

    // Separator between the two names in a relation-map key
    private static final String TAB = "\t";

    // Frequency normalisation constant
    private static final int MAX_FREQUENCE = 2079997;// 7528283+329805;

    // Two linked Words frequency
    private static final double D_TEMP = (double) 1 / MAX_FREQUENCE;

    /**
     * Computes the accumulated path score for stepping from {@code from} to
     * {@code to}: the score of {@code from} plus the negative log of a
     * smoothed mix of the frequency of {@code from} and the frequency of the
     * pair.
     *
     * @param from
     *            the preceding term; its score is overwritten when its
     *            frequency is negative
     * @param to
     *            the following term
     * @param relationMap
     *            optional extra pair frequencies keyed by
     *            {@code fromName + "\t" + toName}; may be null
     * @return the accumulated score
     */
    public static double compuScore(Term from, Term to, Map<String, Double> relationMap) {
        double frequency = from.termNatures().allFreq + 1;

        if (frequency < 0) {
            // Negative frequency: penalise the predecessor itself and stop here.
            double penalised = from.score() + MAX_FREQUENCE;
            from.score(penalised);
            return penalised;
        }

        double nTwoWordsFreq = NgramLibrary.getTwoWordFreq(from, to);
        if (relationMap != null) {
            Double extra = relationMap.get(from.getName() + TAB + to.getName());
            if (extra != null) {
                nTwoWordsFreq += extra;
            }
        }

        double singlePart = D_SMOOTHING_PARA * frequency / (MAX_FREQUENCE + 80000);
        double pairPart = (1 - D_SMOOTHING_PARA) * ((1 - D_TEMP) * nTwoWordsFreq / frequency + D_TEMP);
        double value = -Math.log(singlePart + pairPart);

        if (value < 0) {
            value += frequency;
        }
        return from.score() + value;
    }

    /**
     * Adds up the overall frequencies of two terms.
     *
     * @param from
     *            the first term
     * @param term
     *            the second term
     * @return {@code allFreq} of {@code from} plus {@code allFreq} of {@code term}
     */
    public static double compuScoreFreq(Term from, Term term) {
        return from.termNatures().allFreq + term.termNatures().allFreq;
    }

    /**
     * Scores the transition between two nature (part-of-speech) candidates.
     *
     * @param from
     *            the preceding candidate
     * @param to
     *            the following candidate
     * @return the score of {@code from} plus the log-weighted transition
     *         term plus the self score of {@code to}
     */
    public static double compuNatureFreq(NatureTerm from, NatureTerm to) {
        double twoWordFreq = NatureLibrary.getTwoNatureFreq(from.termNature.nature, to.termNature.nature);
        if (twoWordFreq == 0) {
            // Unseen transition: fall back to the log of the summed self scores.
            twoWordFreq = Math.log(from.selfScore + to.selfScore);
        }
        return from.score + Math.log((from.selfScore + to.selfScore) * twoWordFreq) + to.selfScore;
    }

    public static void main(String[] args) {
        System.out.println(Math.log(D_TEMP * 2));
    }
}

View File

@ -1,80 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.util;
public class MatrixUtil {
/**
*
*
* @param dbs
* @return
*/
public static double sum(double[] dbs) {
double value = 0;
for (double d : dbs) {
value += d;
}
return value;
}
public static int sum(int[] dbs) {
int value = 0;
for (int d : dbs) {
value += d;
}
return value;
}
public static double sum(double[][] w) {
double value = 0;
for (double[] dbs : w) {
value += sum(dbs);
}
return value;
}
public static void dot(double[] feature, double[] feature1) {
if (feature1 == null) {
return;
}
for (int i = 0; i < feature1.length; i++) {
feature[i] += feature1[i];
}
}
public static void dot(float[] feature, float[] feature1) {
if (feature1 == null) {
return;
}
if (feature == null) {
return;
}
int min = Math.min(feature.length, feature1.length);
for (int i = 0; i < min; i++) {
feature[i] += feature1[i];
}
}
}

View File

@ -1,389 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.util;
import org.ansj.app.crf.SplitWord;
import org.ansj.dic.DicReader;
import org.ansj.dic.impl.Jdbc2Stream;
import org.ansj.domain.AnsjItem;
import org.ansj.exception.LibraryException;
import org.ansj.library.*;
import org.ansj.recognition.impl.StopRecognition;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.FileFinder;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.ObjConver;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import java.io.*;
import java.lang.reflect.Field;
import java.util.HashMap;
import java.util.Map;
import java.util.PropertyResourceBundle;
import java.util.ResourceBundle;
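/**
 * Static configuration for the library: recognition switches, the
 * {@code ENV} key/value map loaded from the {@code ansj_library} or
 * {@code library} properties, accessors for the bundled dictionary
 * resources, and helpers to register, remove and reload libraries by key.
 *
 * @author ansj
 *
 */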
public class MyStaticValue {
public static final Log LOG = LogFactory.getLog(MyStaticValue.class);
// Whether person-name recognition is enabled
public static Boolean isNameRecognition = true;
// Whether number recognition is enabled
public static Boolean isNumRecognition = true;
// Whether numbers are merged with a following quantifier
public static Boolean isQuantifierRecognition = true;
// Whether the real (original) word is shown
public static Boolean isRealName = false;
/**
 * Presumably: whether user-defined dictionaries are skipped - TODO confirm
 * against the code that reads this flag.
 */
public static boolean isSkipUserDefine = false;
// Every key/value pair loaded from the properties bundle, plus the
// key -> path entries registered through putLibrary.
public static final Map<String, String> ENV = new HashMap<>();
static {
    // Configuration lookup order: "ansj_library" first, then "library".
    // Each is tried on the classpath and, failing that, as a .properties
    // file located through FileFinder.
    ResourceBundle rb = loadBundle("ansj_library", 1);
    if (rb == null) {
        rb = loadBundle("library", 2);
    }
    if (rb == null) {
        LOG.warn("not find library.properties in classpath use it by default !");
    } else {
        for (String key : rb.keySet()) {
            String raw = rb.getString(key);
            ENV.put(key, raw);
            try {
                String value = raw;
                if (value.startsWith("jdbc:")) { // mask the password of a jdbc string so it is not logged in clear text
                    value = Jdbc2Stream.encryption(value);
                }
                LOG.info("init " + key + " to env value is : " + value);
                // Keys that match a public static field of this class also
                // overwrite that field.
                Field field = MyStaticValue.class.getField(key);
                field.set(null, ObjConver.conversion(raw, field.getType()));
            } catch (Exception ignored) {
                // Deliberate best effort: most keys do not correspond to a
                // field (NoSuchFieldException); they are only kept in ENV.
            }
        }
    }
}

/**
 * Loads the bundle {@code baseName} from the classpath, falling back to a
 * file named {@code baseName + ".properties"} located via FileFinder. The
 * reader opened for the fallback file is closed once the bundle is built.
 *
 * @param baseName
 *            bundle base name without extension
 * @param depth
 *            second argument passed to {@code FileFinder.find}
 * @return the bundle, or {@code null} when it cannot be found or read
 */
private static ResourceBundle loadBundle(String baseName, int depth) {
    try {
        return ResourceBundle.getBundle(baseName);
    } catch (Exception e) {
        try {
            File find = FileFinder.find(baseName + ".properties", depth);
            if (find != null && find.isFile()) {
                ResourceBundle loaded;
                try (Reader reader = IOUtil.getReader(find.getAbsolutePath(),
                        System.getProperty("file.encoding"))) {
                    // PropertyResourceBundle reads everything in its
                    // constructor, so the reader can be closed right after.
                    loaded = new PropertyResourceBundle(reader);
                }
                LOG.info("load " + baseName + " not find in classPath ! i find it in " + find.getAbsolutePath()
                        + " make sure it is your config!");
                return loaded;
            }
        } catch (Exception e1) {
            LOG.warn("not find " + baseName + ".properties. reason: " + e1.getMessage());
        }
        return null;
    }
}
/**
 * Opens the bundled resource {@code person/person.dic}.
 *
 * @return the reader obtained from {@code DicReader.getReader}
 */
public static BufferedReader getPersonReader() {
return DicReader.getReader("person/person.dic");
}
/**
 * Opens the bundled resource {@code company/company.data}.
 *
 * @return the reader obtained from {@code DicReader.getReader}
 */
public static BufferedReader getCompanReader() {
return DicReader.getReader("company/company.data");
}
/**
 * Opens the bundled resource {@code newWord/new_word_freq.dic}.
 *
 * @return the reader obtained from {@code DicReader.getReader}
 */
public static BufferedReader getNewWordReader() {
return DicReader.getReader("newWord/new_word_freq.dic");
}
/**
 * Opens the bundled resource {@code arrays.dic}.
 *
 * @return the reader obtained from {@code DicReader.getReader}
 */
public static BufferedReader getArraysReader() {
return DicReader.getReader("arrays.dic");
}
/**
 * Opens the bundled resource {@code numberLibrary.dic}.
 *
 * @return the reader obtained from {@code DicReader.getReader}
 */
public static BufferedReader getNumberReader() {
return DicReader.getReader("numberLibrary.dic");
}
/**
 * Opens the bundled resource {@code englishLibrary.dic}.
 *
 * @return the reader obtained from {@code DicReader.getReader}
 */
public static BufferedReader getEnglishReader() {
return DicReader.getReader("englishLibrary.dic");
}
/**
 * Opens the bundled resource {@code nature/nature.map}.
 *
 * @return the reader obtained from {@code DicReader.getReader}
 */
public static BufferedReader getNatureMapReader() {
return DicReader.getReader("nature/nature.map");
}
/**
 * Opens the bundled resource {@code nature/nature.table}.
 *
 * @return the reader obtained from {@code DicReader.getReader}
 */
public static BufferedReader getNatureTableReader() {
return DicReader.getReader("nature/nature.table");
}
/**
 * Opens the bundled resource {@code nature_class_suffix.txt}.
 *
 * @return the reader obtained from {@code DicReader.getReader}
 */
public static BufferedReader getNatureClassSuffix() {
return DicReader.getReader("nature_class_suffix.txt");
}
/**
 * Opens the bundled resource {@code person/name_freq.dic}.
 *
 * @return the reader obtained from {@code DicReader.getReader}
 */
public static BufferedReader getPersonFreqReader() {
return DicReader.getReader("person/name_freq.dic");
}
/**
 * Loads a frequency table by Java-deserializing the bundled resource
 * {@code person/asian_name_freq.data}. Failures are logged and not
 * propagated.
 *
 * NOTE(review): this uses native Java deserialization; it is only safe as
 * long as the resource comes from a trusted source.
 *
 * @return the deserialized map, or an empty map when reading or
 *         deserializing fails
 */
@SuppressWarnings("unchecked")
public static Map<String, int[][]> getPersonFreqMap() {
// Returned unchanged when loading fails.
Map<String, int[][]> map = new HashMap<>(0);
try (InputStream inputStream = DicReader.getInputStream("person/asian_name_freq.data")) {
ObjectInputStream objectInputStream = new ObjectInputStream(inputStream);
map = (Map<String, int[][]>) objectInputStream.readObject();
} catch (IOException e) {
LOG.warn("IO异常", e);
} catch (ClassNotFoundException e) {
LOG.warn("找不到类", e);
}
return map;
}
/**
 * Loads pair frequencies from the bundled resource {@code bigramdict.dic}
 * (read as UTF-8) into the {@code bigramEntryMap} of the dictionary items.
 *
 * Each non-blank line is split on a tab into a word pair and a frequency;
 * the pair is split on {@code '@'} into its two words. Pairs for which
 * either word has no dictionary item are skipped.
 *
 * Number-format and I/O problems are logged and end the loading.
 * NOTE(review): a line without a tab or without {@code '@'} is not guarded
 * and would raise ArrayIndexOutOfBoundsException.
 */
public static void initBigramTables() {
try (BufferedReader reader = IOUtil.getReader(DicReader.getInputStream("bigramdict.dic"), "UTF-8")) {
String temp = null;
String[] strs = null;
int freq = 0;
while ((temp = reader.readLine()) != null) {
if (StringUtil.isBlank(temp)) {
continue;
}
strs = temp.split("\t");
freq = Integer.parseInt(strs[1]);
strs = strs[0].split("@");
AnsjItem fromItem = DATDictionary.getItem(strs[0]);
AnsjItem toItem = DATDictionary.getItem(strs[1]);
// Words that are not in the dictionary but contain '#' are mapped to the
// BEGIN / END items.
if (fromItem == AnsjItem.NULL && strs[0].contains("#")) {
fromItem = AnsjItem.BEGIN;
}
if (toItem == AnsjItem.NULL && strs[1].contains("#")) {
toItem = AnsjItem.END;
}
if (fromItem == AnsjItem.NULL || toItem == AnsjItem.NULL) {
continue;
}
// The per-item map is created lazily on first use.
if (fromItem.bigramEntryMap == null) {
fromItem.bigramEntryMap = new HashMap<Integer, Integer>();
}
fromItem.bigramEntryMap.put(toItem.getIndex(), freq);
}
} catch (NumberFormatException e) {
LOG.warn("数字格式异常", e);
} catch (UnsupportedEncodingException e) {
LOG.warn("不支持的编码", e);
} catch (IOException e) {
LOG.warn("IO异常", e);
}
}
/**
 * Returns the logger for {@code clazz}, obtained from
 * {@code LogFactory.getLog}.
 *
 * @param clazz
 *            the class the logger is created for
 * @return the logger
 */
public static Log getLog(Class<?> clazz) {
return LogFactory.getLog(clazz);
}
/**
 * Registers an already built library object under {@code key} and records
 * {@code key -> path} in {@code ENV}. The target library is chosen by the
 * prefix of {@code key}, compared against each library's {@code DEFAULT}
 * constant.
 *
 * NOTE(review): the error message lists four prefixes although the CRF
 * library prefix is accepted as well.
 *
 * @param key
 *            library key; its prefix selects the library type
 * @param path
 *            path recorded for the library
 * @param value
 *            the library object; it is cast to the type expected by the
 *            selected library, so a mismatch raises ClassCastException
 * @throws LibraryException
 *             if {@code key} matches none of the known prefixes
 */
public static void putLibrary(String key, String path, Object value) {
if (key.startsWith(DicLibrary.DEFAULT)) {
DicLibrary.put(key, path, (Forest) value);
} else if (key.startsWith(StopLibrary.DEFAULT)) {
StopLibrary.put(key, path, (StopRecognition) value);
} else if (key.startsWith(SynonymsLibrary.DEFAULT)) {
SynonymsLibrary.put(key, path, (SmartForest) value);
} else if (key.startsWith(AmbiguityLibrary.DEFAULT)) {
AmbiguityLibrary.put(key, path, (Forest) value);
} else if (key.startsWith(CrfLibrary.DEFAULT)) {
CrfLibrary.put(key, path, (SplitWord) value);
} else {
throw new LibraryException(key + " type err must start with dic,stop,ambiguity,synonyms");
}
ENV.put(key, path);
}
/**
 * Registers a library by path under {@code key} and records
 * {@code key -> path} in {@code ENV}. The target library is chosen by the
 * prefix of {@code key}, compared against each library's {@code DEFAULT}
 * constant.
 *
 * @param key
 *            library key; its prefix selects the library type
 * @param path
 *            path handed to the selected library
 * @throws LibraryException
 *             if {@code key} matches none of the known prefixes
 */
public static void putLibrary(String key, String path) {
if (key.startsWith(DicLibrary.DEFAULT)) {
DicLibrary.put(key, path);
} else if (key.startsWith(StopLibrary.DEFAULT)) {
StopLibrary.put(key, path);
} else if (key.startsWith(SynonymsLibrary.DEFAULT)) {
SynonymsLibrary.put(key, path);
} else if (key.startsWith(AmbiguityLibrary.DEFAULT)) {
AmbiguityLibrary.put(key, path);
} else if (key.startsWith(CrfLibrary.DEFAULT)) {
CrfLibrary.put(key, path);
} else {
throw new LibraryException(key + " type err must start with dic,stop,ambiguity,synonyms");
}
ENV.put(key, path);
}
/**
 * Removes the library registered under {@code key} from the library
 * selected by the key's prefix and drops the key from {@code ENV}.
 *
 * @param key
 *            library key; its prefix selects the library type
 * @throws LibraryException
 *             if {@code key} matches none of the known prefixes
 */
public static void removeLibrary(String key) {
if (key.startsWith(DicLibrary.DEFAULT)) {
DicLibrary.remove(key);
} else if (key.startsWith(StopLibrary.DEFAULT)) {
StopLibrary.remove(key);
} else if (key.startsWith(SynonymsLibrary.DEFAULT)) {
SynonymsLibrary.remove(key);
} else if (key.startsWith(AmbiguityLibrary.DEFAULT)) {
AmbiguityLibrary.remove(key);
} else if (key.startsWith(CrfLibrary.DEFAULT)) {
CrfLibrary.remove(key);
} else {
throw new LibraryException(key + " type err must start with dic,stop,ambiguity,synonyms");
}
ENV.remove(key);
}
/**
 * Asks the library selected by the prefix of {@code key} to reload the
 * entry registered under that key. {@code ENV} is not modified.
 *
 * @param key
 *            library key; its prefix selects the library type
 * @throws LibraryException
 *             if {@code key} matches none of the known prefixes
 */
public static void reloadLibrary(String key) {
if (key.startsWith(DicLibrary.DEFAULT)) {
DicLibrary.reload(key);
} else if (key.startsWith(StopLibrary.DEFAULT)) {
StopLibrary.reload(key);
} else if (key.startsWith(SynonymsLibrary.DEFAULT)) {
SynonymsLibrary.reload(key);
} else if (key.startsWith(AmbiguityLibrary.DEFAULT)) {
AmbiguityLibrary.reload(key);
} else if (key.startsWith(CrfLibrary.DEFAULT)) {
CrfLibrary.reload(key);
} else {
throw new LibraryException(key + " type err must start with dic,stop,ambiguity,synonyms");
}
}
}

View File

@ -1,72 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.util;
import org.ansj.domain.Term;
import org.ansj.domain.TermNatures;
import org.ansj.recognition.impl.NatureRecognition;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.util.WordAlert;
public class NameFix {
    /**
     * Repairs two kinds of person-name segmentation problems in place.
     *
     * First pass: when a two-character term whose natures are
     * {@code TermNatures.NR} is followed by a term flagged as splittable
     * ({@code personAttr.split > 0}), the first character of the follower is
     * moved onto the name and the remainder becomes a new term.
     *
     * Second pass: a single middle-dot term ('·') between two terms whose
     * nature starts with "nr" is merged with both neighbours into one term.
     *
     * Slots that are missing or out of range are skipped instead of raising
     * an exception.
     *
     * @param terms
     *            the term array to fix, indexed by character offset; a null
     *            array is ignored
     * @param forests
     *            user dictionaries passed to {@code NatureRecognition} when
     *            the natures of a newly created term are looked up
     */
    public static void nameAmbiguity(Term[] terms, Forest... forests) {
        if (terms == null) {
            return;
        }
        fixSplitNames(terms, forests);
        fixForeignNames(terms);
    }

    /** Moves the first character of a splittable follower onto a two-character name. */
    private static void fixSplitNames(Term[] terms, Forest... forests) {
        for (int i = 0; i < terms.length - 1; i++) {
            Term term = terms[i];
            if (term == null || term.termNatures() != TermNatures.NR || term.getName().length() != 2) {
                continue;
            }
            // The fix reads slot i + 2 and writes slot i + 3; both must exist.
            if (i + 3 >= terms.length) {
                continue;
            }
            Term next = terms[i + 2];
            if (next == null || next.termNatures().personAttr.split <= 0) {
                continue;
            }
            term.setName(term.getName() + next.getName().charAt(0));
            terms[i + 2] = null;
            String name = next.getName().substring(1);
            terms[i + 3] = new Term(name, next.getOffe() + 1,
                    new NatureRecognition(forests).getTermNatures(name));
            TermUtil.termLink(term, terms[i + 3]);
            TermUtil.termLink(terms[i + 3], next.to());
        }
    }

    /** Merges "name · name" into a single term (foreign person names). */
    private static void fixForeignNames(Term[] terms) {
        // Slot 0 has no predecessor to merge with, so start at 1.
        for (int i = 1; i < terms.length; i++) {
            Term term = terms[i];
            if (term == null || term.getName().length() != 1
                    || WordAlert.CharCover(term.getName().charAt(0)) != '·') {
                continue;
            }
            Term from = term.from();
            Term next = term.to();
            if (from == null || next == null) {
                continue;
            }
            if (from.natrue().natureStr.startsWith("nr") && next.natrue().natureStr.startsWith("nr")) {
                from.setName(from.getName() + term.getName() + next.getName());
                TermUtil.termLink(from, next.to());
                terms[i] = null;
                if (i + 1 < terms.length) {
                    terms[i + 1] = null;
                }
            }
        }
    }
}

View File

@ -1,220 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.ansj.util;
import org.ansj.domain.Nature;
import org.ansj.domain.Term;
import org.ansj.domain.TermNatures;
import org.ansj.library.NatureLibrary;
import org.ansj.library.company.CompanyAttrLibrary;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
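/**
 * Static helpers for working with {@code Term} objects: linking terms,
 * inserting them into an offset-indexed {@code Term[]}, and deriving the
 * nature of new words.
 *
 * @author ansj
 *
 */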
public class TermUtil {
/**
 * Creates a term that concatenates the names of {@code from} and
 * {@code to}, positioned at the offset of {@code from}, and links it to
 * the successor of {@code to}.
 *
 * The number attribute of {@code from} is written into the
 * {@code termNatures} of the new term.
 * NOTE(review): this assigns a field on whatever object
 * {@code term.termNatures()} returns; if that is the passed-in
 * {@code termNatures} instance, a shared instance is mutated - confirm.
 *
 * NOTE(review): {@code term.from()} is read from the term that was just
 * created; if it is still null at that point the second termLink call is a
 * no-op. Linking {@code from.from()} may have been intended - confirm.
 *
 * @param from
 *            the first term
 * @param to
 *            the second term
 * @param termNatures
 *            natures given to the new term
 * @return the newly created term
 */
public static Term makeNewTermNum(Term from, Term to, TermNatures termNatures) {
Term term = new Term(from.getName() + to.getName(), from.getOffe(), termNatures);
term.termNatures().numAttr = from.termNatures().numAttr;
TermUtil.termLink(term, to.to());
TermUtil.termLink(term.from(), term);
return term;
}
/**
 * Links two terms in both directions: {@code from.to = to} and
 * {@code to.from = from}. Does nothing when either argument is null.
 *
 * @param from
 *            the preceding term; may be null
 * @param to
 *            the following term; may be null
 */
public static void termLink(Term from, Term to) {
    if (from != null && to != null) {
        from.setTo(to);
        to.setFrom(from);
    }
}
/**
 * What {@code insertTerm} does when a term of the same length already
 * exists at the target offset.
 */
public static enum InsertTermType {
/**
 * Type 0: keep the existing term and drop the new one.
 */
SKIP,
/**
 * Type 1: put the new term in place of the existing one.
 */
REPLACE,
/**
 * Type 2: keep the existing term and add the score and self score of the
 * new term to it.
 */
SCORE_ADD_SORT
}
/**
 * Inserts {@code term} into the chain stored at
 * {@code terms[term.getOffe()]}. The chain is kept ordered by name length,
 * shortest first. When a term of the same length is already present,
 * {@code type} decides: SKIP leaves the chain unchanged, REPLACE swaps the
 * new term in, SCORE_ADD_SORT adds the new term's score and self score to
 * the existing one.
 *
 * @param terms
 *            the offset-indexed term array
 * @param term
 *            the term to insert
 * @param type
 *            conflict policy for an existing term of equal length
 */
public static void insertTerm(Term[] terms, Term term, InsertTermType type) {
Term self = terms[term.getOffe()];
// Empty slot: the new term becomes the head.
if (self == null) {
terms[term.getOffe()] = term;
return;
}
int len = term.getName().length();
// The head of the chain has the same length
if (self.getName().length() == len) {
if (type == InsertTermType.REPLACE) {
term.setNext(self.next());
terms[term.getOffe()] = term;
} else if (type == InsertTermType.SCORE_ADD_SORT) {
self.score(self.score() + term.score());
self.selfScore(self.selfScore() + term.selfScore());
}
return;
}
// The new term is shorter than the head: it becomes the new head.
if (self.getName().length() > len) {
term.setNext(self);
terms[term.getOffe()] = term;
return;
}
// Walk the chain to the first element that is as long or longer.
Term next = self;
Term before = self;
while ((next = before.next()) != null) {
if (next.getName().length() == len) {
if (type == InsertTermType.REPLACE) {
term.setNext(next.next());
before.setNext(term);
} else if (type == InsertTermType.SCORE_ADD_SORT) {
next.score(next.score() + term.score());
next.selfScore(next.selfScore() + term.selfScore());
}
return;
} else if (next.getName().length() > len) {
before.setNext(term);
term.setNext(next);
return;
}
before = next;
}
before.setNext(term); // nothing matched: append at the end of the chain
}
/**
 * Stores {@code term} at its own offset, overwriting whatever chain was
 * there.
 *
 * @param terms
 *            the offset-indexed term array
 * @param term
 *            the term to store
 */
public static void insertTermNum(Term[] terms, Term term) {
    final int offset = term.getOffe();
    terms[offset] = term;
}

/**
 * Merges the terms of {@code tempList} into a single term with natures
 * {@code TermNatures.NR}: the slots of all listed terms are cleared and the
 * merged term is stored at the offset of the first one.
 *
 * NOTE(review): the {@code nr} parameter is not used; the constant
 * {@code TermNatures.NR} is always applied.
 *
 * @param terms
 *            the offset-indexed term array
 * @param tempList
 *            the terms to merge, in order; must not be empty
 * @param nr
 *            unused
 */
public static void insertTerm(Term[] terms, List<Term> tempList, TermNatures nr) {
    StringBuilder mergedName = new StringBuilder();
    final int firstOffset = tempList.get(0).getOffe();
    for (Term part : tempList) {
        mergedName.append(part.getName());
        terms[part.getOffe()] = null;
    }
    insertTermNum(terms, new Term(mergedName.toString(), firstOffset, TermNatures.NR));
}
/**
 * Links {@code from} and {@code to} in both directions. Unlike
 * {@code termLink} there is no null check, and the argument order is
 * (to, from).
 *
 * @param to
 *            the following term
 * @param from
 *            the preceding term
 * @return {@code from}
 */
protected static Term setToAndfrom(Term to, Term from) {
from.setTo(to);
to.setFrom(from);
return from;
}
// Company attribute table from CompanyAttrLibrary, keyed by word; parseNature
// reads element [1] of the entry for the last sub-term.
private static final HashMap<String, int[]> companyMap = CompanyAttrLibrary.getCompanyMap();

/**
 * Refines the nature of a new-word term in place. Terms whose nature is not
 * {@code Nature.NW}, or whose name has three characters or fewer, are left
 * untouched. A name recognised as a foreign person name gets nature "nrf";
 * otherwise, when the company-table value for the last sub-term exceeds
 * 1000, the term gets nature "nt".
 *
 * @param term
 *            the term to inspect; its nature and sub-term list may be
 *            updated
 */
public static void parseNature(Term term) {
    if (!Nature.NW.equals(term.natrue())) {
        return;
    }
    String name = term.getName();
    if (name.length() <= 3) {
        return;
    }
    // Foreign person name?
    if (ForeignPersonRecognition.isFName(name)) {
        term.setNature(NatureLibrary.getNature("nrf"));
        return;
    }
    List<Term> subTerm = term.getSubTerm();
    term.setSubTerm(subTerm);
    // Without sub-terms there is no suffix to look up.
    if (subTerm == null || subTerm.isEmpty()) {
        return;
    }
    // Organisation name? Only the last sub-term is consulted.
    Term last = subTerm.get(subTerm.size() - 1);
    int[] is = companyMap.get(last.getName());
    if (is != null && is[1] > 1000) {
        term.setNature(NatureLibrary.getNature("nt"));
    }
}
/**
 * Collects the terms strictly between {@code from} and {@code to} by
 * following the {@code to()} links.
 *
 * @param from
 *            the term before the first collected one (exclusive)
 * @param to
 *            the term at which collection stops (exclusive); must be
 *            reachable from {@code from}
 * @return the terms in between, in link order
 */
public static List<Term> getSubTerm(Term from, Term to) {
    List<Term> between = new ArrayList<>(3);
    for (Term cursor = from.to(); cursor != to; cursor = cursor.to()) {
        between.add(cursor);
    }
    return between;
}
}

View File

@ -1,83 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.deeplearning4j.nlp.chinese.tokenization.tokenizer;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
 * Tokenizer that segments a string with {@code NlpAnalysis} and iterates
 * over the names of the resulting terms.
 */
public class ChineseTokenizer implements Tokenizer {

    private TokenPreProcess tokenPreProcess;
    private List<Term> tokenList;
    private Iterator<Term> tokenIter;

    /**
     * Creates a tokenizer without input; it reports zero tokens.
     */
    public ChineseTokenizer() {
        // Start from an empty list so that hasMoreTokens()/getTokens() work
        // instead of failing on a null iterator.
        this.tokenList = new ArrayList<>();
        this.tokenIter = tokenList.iterator();
    }

    /**
     * Segments {@code toTokenize} immediately.
     *
     * @param toTokenize
     *            the text to segment
     */
    public ChineseTokenizer(String toTokenize) {
        Result result = NlpAnalysis.parse(toTokenize);
        this.tokenList = result.getTerms();
        this.tokenIter = tokenList.iterator();
    }

    /** @return true while tokens remain to be returned by {@link #nextToken()} */
    @Override
    public boolean hasMoreTokens() {
        return tokenIter != null && tokenIter.hasNext();
    }

    /** @return the total number of tokens, independent of the iteration position */
    @Override
    public int countTokens() {
        return tokenList != null ? tokenList.size() : 0;
    }

    /**
     * Returns the next token, run through the pre-processor when one is set.
     *
     * @throws NoSuchElementException
     *             if no tokens remain
     */
    @Override
    public String nextToken() {
        if (!hasMoreTokens()) {
            throw new NoSuchElementException();
        }
        String token = tokenIter.next().getName();
        return tokenPreProcess != null ? tokenPreProcess.preProcess(token) : token;
    }

    /**
     * Drains the remaining tokens into a list.
     *
     * @return the tokens not yet consumed, in order
     */
    @Override
    public List<String> getTokens() {
        List<String> remaining = new ArrayList<>();
        while (hasMoreTokens()) {
            remaining.add(nextToken());
        }
        return remaining;
    }

    @Override
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcessor) {
        this.tokenPreProcess = tokenPreProcessor;
    }
}

View File

@ -1,58 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.deeplearning4j.nlp.chinese.tokenization.tokenizerFactory;
import org.deeplearning4j.nlp.chinese.tokenization.tokenizer.ChineseTokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import java.io.InputStream;
/**
 * Factory for {@link ChineseTokenizer} instances; the configured
 * pre-processor is handed to every tokenizer it creates.
 */
public class ChineseTokenizerFactory implements TokenizerFactory {

    private TokenPreProcess tokenPreProcess;

    /**
     * Creates a tokenizer for the given text.
     *
     * @param toTokenize
     *            the text to segment
     * @return a tokenizer carrying this factory's pre-processor
     */
    @Override
    public Tokenizer create(String toTokenize) {
        ChineseTokenizer chineseTokenizer = new ChineseTokenizer(toTokenize);
        chineseTokenizer.setTokenPreProcessor(this.tokenPreProcess);
        return chineseTokenizer;
    }

    /**
     * Stream input is not supported.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public Tokenizer create(InputStream toTokenize) {
        throw new UnsupportedOperationException();
    }

    @Override
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcess) {
        this.tokenPreProcess = tokenPreProcess;
    }

    @Override
    public TokenPreProcess getTokenPreProcessor() {
        return this.tokenPreProcess;
    }
}

View File

@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -1,8 +0,0 @@
ansj_seg
Copyright 2011-2016 ansj_seg
the deeplearning4j-nlp-chinese
Copyright 2017-2022 the deeplearning4j-nlp-chinese
This product includes software developed by The Apache Software
Foundation (http://www.apache.org/).

View File

@ -1,105 +0,0 @@
a 4
b 4
c 4
d 4
e 4
f 4
g 4
h 4
i 4
j 4
k 4
l 4
m 4
n 4
o 4
p 4
q 4
r 4
s 4
t 4
u 4
v 4
w 4
x 4
y 4
z 4
A 4
B 4
C 4
D 4
E 4
F 4
G 4
H 4
I 4
J 4
K 4
L 4
M 4
N 4
O 4
P 4
Q 4
R 4
S 4
T 4
U 4
V 4
W 4
X 4
Y 4
Z 4
' 4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4

View File

@ -1,50 +0,0 @@
0 0 始##始 50610
1 1 末##末 0
2 2 a 34439
3 3 ad 5899
4 4 ag 311
5 5 an 2838
6 6 b 8734
7 7 bg 5
8 8 c 25473
9 9 d 47714
10 10 dg 126
11 11 e 26
12 12 f 17248
13 13 h 48
14 14 i 5001
15 15 j 10293
16 16 k 958
17 17 l 6055
18 18 m 41036
19 19 mg 6
20 20 n 237124
21 21 ng 4497
22 22 nr 20061
23 23 ns 27777
24 24 nt 3565
25 25 nx 459
26 26 nz 3728
27 27 o 70
28 28 p 39906
29 29 q 24236
30 30 r 32367
31 31 rg 10
32 32 s 3868
33 33 t 20646
34 34 tg 486
35 35 u 5194
36 36 ud 661
37 37 ug 449
38 38 uj 54477
39 39 ul 10234
40 40 uv 2121
41 41 uz 1664
42 42 v 184620
43 43 vd 493
44 44 vg 1866
45 45 vn 42615
46 46 w 173046
47 47 y 1892
48 48 yg 1
49 49 z 1315

View File

@ -1,50 +0,0 @@
0 0 648 172 11 17 245 0 2702 1653 8 11 204 4 197 694 0 443 2859 1 6233 23 3275 3416 828 25 263 3 5245 11 6857 0 177 3512 27 10 0 0 1 0 0 0 5230 4 54 440 5047 2 0 58
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 173 833 8 6 127 62 0 451 296 2 0 84 0 13 80 4 125 896 0 11004 139 53 121 2 2 10 0 296 258 94 0 17 45 4 152 75 4 7616 167 1129 14 928 3 11 2264 6481 357 0 33
0 0 72 35 0 0 1 0 1 79 0 0 3 0 5 2 0 19 4 0 3 0 0 0 0 0 0 0 124 0 5 0 0 1 1 0 0 0 0 0 0 0 5482 5 21 2 34 0 0 0
0 6 10 0 1 0 1 1 10 8 0 0 2 0 0 0 0 1 4 0 53 16 2 1 0 0 0 0 2 0 2 0 0 0 0 0 0 0 15 0 1 0 57 0 7 1 107 3 0 0
0 37 10 5 0 46 0 0 231 114 0 0 37 0 7 0 0 3 12 0 264 6 2 2 1 0 0 0 23 1 5 0 0 0 1 18 0 0 150 0 2 0 260 0 0 219 1375 7 0 0
0 3 98 1 1 61 146 0 42 17 0 0 78 0 4 52 0 47 69 0 5576 19 3 82 9 6 16 0 17 3 26 0 18 7 0 12 1 0 804 0 2 0 121 0 3 801 581 5 0 3
0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0
0 1 997 182 6 142 292 0 102 1507 2 0 100 9 128 264 0 234 645 0 6529 29 316 1099 91 15 94 1 1411 10 1134 0 145 292 7 18 0 0 2 0 0 0 6181 12 44 1800 1604 0 0 28
0 5 6181 470 39 2 59 0 93 4042 17 0 46 0 440 16 1 294 740 1 307 19 21 41 4 3 2 4 3511 15 296 0 8 74 4 30 1 0 41 4 193 0 29677 33 359 34 517 8 0 62
0 0 40 0 2 0 0 0 0 2 0 0 0 0 0 0 0 1 6 0 2 0 0 0 0 0 0 0 3 0 1 0 0 5 0 0 0 0 0 0 0 0 60 0 3 0 1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 23 0 0 0
0 26 185 144 3 13 43 0 133 1216 2 0 78 0 79 32 1 81 672 0 1254 25 24 120 8 1 6 2 414 35 203 0 19 16 0 66 0 0 1852 2 2 1 4237 12 39 170 5992 21 0 19
0 0 2 0 0 0 2 0 0 0 0 0 0 0 0 6 0 1 2 0 26 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 5 0 0 0 0
0 45 9 5 1 0 2 0 43 69 3 0 44 0 18 2 2 8 14 1 75 6 4 8 0 0 1 0 61 5 8 0 2 3 1 32 1 0 962 3 318 1 299 0 7 8 2896 31 0 3
0 26 113 42 0 5 281 0 194 280 0 0 244 0 10 1247 5 30 466 0 3195 16 14 124 6 1 19 0 248 6 82 0 44 69 4 101 0 0 379 1 0 0 1296 10 5 514 1213 1 0 2
0 1 27 11 1 0 6 0 15 151 0 0 1 0 21 0 0 10 5 0 40 1 1 1 0 1 1 0 96 0 14 0 10 12 0 3 0 0 87 0 0 0 314 1 1 2 121 2 0 1
0 41 65 23 1 8 5 0 161 183 0 0 95 0 12 15 0 25 53 0 605 13 15 16 1 0 1 0 100 1 44 0 8 13 0 61 1 1 896 3 86 1 581 4 7 175 2712 23 0 0
0 294 1243 19 12 50 143 1 115 327 0 0 236 1 60 81 3 68 2580 0 6960 282 27 222 0 6 54 1 223 21924 50 0 24 69 13 9 0 0 503 1 16 0 1941 0 28 369 3040 12 0 29
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0
0 1604 4111 1345 37 438 1116 0 8461 12529 34 1 7892 2 846 1164 840 892 4265 0 42974 660 5538 922 111 40 164 13 6366 71 1503 1 421 1377 24 2153 0 1 19021 2 121 3 34836 145 284 12625 61590 328 0 253
0 49 81 13 12 0 6 0 63 177 2 0 78 0 9 19 3 15 93 0 537 53 15 140 53 0 10 1 146 4 33 0 1 83 2 35 0 0 190 0 0 0 792 1 15 39 1708 14 0 5
0 836 113 66 0 0 69 0 455 852 1 0 38 0 57 10 3 45 243 0 2412 38 428 116 9 1 4 1 1173 2 124 0 46 618 1 429 1 0 651 0 0 0 3848 7 367 38 6936 9 0 14
0 77 293 102 2 28 243 0 504 701 0 0 448 1 33 565 5 138 839 0 10395 68 101 1677 122 6 595 0 582 10 252 0 235 1570 6 348 0 0 1632 0 0 0 2908 7 14 550 2704 9 0 7
0 4 16 10 0 8 177 0 71 81 0 0 28 0 3 99 0 9 39 0 993 9 8 813 36 0 0 0 151 0 23 0 3 50 0 19 0 0 112 0 0 0 394 0 1 76 332 0 0 0
0 0 7 1 0 0 4 0 3 7 0 0 4 0 1 0 20 0 10 0 170 2 1 0 0 0 2 0 6 6 1 0 1 0 0 4 0 0 23 0 0 0 19 0 0 16 151 0 0 0
0 7 34 5 0 1 31 0 40 37 0 0 27 1 4 28 4 10 49 0 1912 6 3 28 8 13 20 0 25 2 15 0 7 9 0 34 0 0 100 0 0 0 152 1 3 76 1034 0 0 2
0 1 2 0 0 0 0 0 0 2 0 1 0 0 0 0 0 0 4 0 7 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 10 0 7 1 14 0 0 1 17 0 0 0
0 4 1334 179 3 42 560 0 9 665 1 0 408 2 145 1036 1 305 2072 1 11794 109 836 4298 736 29 216 3 457 24 4177 3 771 2747 38 16 0 0 3 9 0 1 5230 10 35 867 679 0 0 51
0 95 1285 46 7 64 385 1 127 608 0 0 1204 2 102 276 0 139 699 0 7933 74 46 212 15 13 75 0 351 21 133 0 59 107 8 45 0 0 1122 3 26 0 2172 5 23 773 5875 46 0 59
0 23 966 195 13 42 178 0 313 3150 2 0 175 3 225 126 0 202 1934 0 6688 530 20 704 3 2 26 3 1664 1021 528 0 134 245 4 147 0 0 2249 1 5 1 8152 12 35 771 1753 60 0 62
0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 3 0 0 0
0 12 71 9 0 5 31 0 34 222 1 1 21 0 21 18 0 26 152 0 1104 14 1 22 0 1 6 4 35 4 34 0 18 4 2 16 0 0 433 0 0 0 752 3 2 138 635 5 0 11
0 90 203 97 3 3 33 0 104 927 0 0 1002 0 26 101 0 48 474 0 2936 244 74 332 16 5 25 0 1396 7 304 0 39 4576 125 84 0 0 1026 0 0 0 2751 3 13 219 3342 9 0 9
0 7 2 2 0 0 0 0 2 47 0 0 5 0 1 1 0 5 7 0 27 5 2 2 0 0 1 0 43 1 1 0 0 16 34 6 0 0 21 0 0 0 98 0 7 3 137 2 0 1
0 6 211 8 8 22 57 0 7 123 1 0 6 1 11 15 0 17 387 0 1646 425 0 26 2 0 2 0 54 13 73 0 15 8 30 8 0 0 87 0 11 0 1078 0 25 149 656 0 1 5
0 0 146 0 0 2 1 0 1 203 0 0 1 0 59 0 0 11 10 0 27 0 1 2 0 0 2 3 5 1 21 0 3 0 0 2 0 0 0 2 0 1 108 0 1 2 3 4 0 39
0 0 30 1 0 2 4 0 0 4 0 0 5 0 0 4 0 1 86 0 95 3 6 17 0 0 4 0 3 0 45 0 0 4 0 0 0 0 40 0 0 0 15 0 0 10 69 1 0 0
0 17 3740 48 10 765 1145 0 47 624 1 0 171 4 236 490 0 358 2988 0 27302 46 543 1211 118 45 230 3 64 34 656 0 227 379 8 9 0 0 0 0 0 0 1266 13 15 8491 3027 22 0 124
0 2 1003 21 5 26 179 0 8 349 2 0 31 0 69 122 0 82 1942 0 2626 43 149 494 30 9 45 0 270 59 664 0 81 207 3 3 0 0 19 0 0 0 785 5 5 496 370 6 0 24
0 0 20 9 0 0 0 0 1 29 0 0 0 0 10 0 0 8 6 0 4 0 0 0 0 0 0 1 199 0 4 0 0 0 0 0 0 0 0 0 0 0 1803 1 4 1 19 0 0 2
0 2 138 4 2 18 20 0 2 42 0 0 7 0 18 14 0 8 155 0 497 2 30 56 2 1 4 1 35 12 139 0 15 18 3 1 0 0 24 0 0 0 195 1 0 16 157 1 0 24
0 661 6913 851 69 741 1741 1 2092 4988 19 0 2445 13 875 1852 13 1079 10903 1 34362 1107 1298 5012 369 62 413 6 6621 515 7086 3 802 1841 53 625 576 438 10591 10006 110 1622 30472 109 183 4728 29379 813 0 161
0 0 10 15 0 0 0 0 0 28 0 0 1 0 0 0 0 1 0 0 4 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 2 1 0 0 408 2 0 2 16 0 0 0
0 6 58 1 6 7 14 0 13 28 0 0 8 0 9 36 2 4 86 0 433 54 10 65 2 0 3 1 65 4 58 0 4 56 4 10 1 3 18 13 0 12 214 0 5 21 523 6 0 3
0 262 286 108 0 57 134 0 2147 1160 0 0 1110 0 58 40 1 71 170 0 15841 109 9 38 0 3 6 0 317 2 89 0 51 42 2 344 0 0 2247 0 4 0 2683 13 8 3364 11806 25 0 8
0 46148 2796 1646 49 92 1316 1 6660 10173 28 11 875 5 1188 1786 49 1189 4355 1 21974 298 7184 6332 982 169 1407 16 8087 153 7581 2 459 2568 75 337 4 2 1244 15 4 6 26623 71 227 2329 6280 42 0 207
0 26 0 0 0 0 0 0 0 2 0 0 1 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 4 0 1 0 1837 15 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 12 35 0 1 4 2 0 16 10 0 0 5 0 1 0 0 2 41 0 299 2 1 4 0 0 0 0 14 0 1 0 4 3 1 7 0 0 303 1 83 0 179 0 4 9 262 3 0 6

View File

@ -1,996 +0,0 @@
公司 nt 883182.0
厂 nt 689589.0
部 nt 337479.0
中心 nt 159768.0
商行 nt 71428.0
办事处 nt 61678.0
加工厂 nt 57852.0
店 nt 52534.0
站 nt 50292.0
机械厂 nt 39491.0
集团 nt 35976.0
业 nt 32541.0
业务部 nt 31541.0
商店 nt 27311.0
行 nt 24684.0
处 nt 22266.0
门市部 nt 19569.0
总公司 nt 19119.0
研究所 nt 18541.0
经销处 nt 18359.0
工作室 nt 18259.0
加油站 nt 17740.0
基地 nt 17597.0
化工厂 nt 15048.0
经营 nt 14226.0
印刷厂 nt 13281.0
服务部 nt 12905.0
批发部 nt 12852.0
专卖店 nt 11758.0
事务所 nt 11576.0
食品厂 nt 11366.0
学校 nt 11126.0
场 nt 11015.0
合作社 nt 10824.0
制造厂 nt 10702.0
集团公司 nt 8663.0
代表处 nt 8250.0
所 nt 8187.0
商场 nt 8042.0
养殖场 nt 8034.0
队 nt 7689.0
学院 nt 7305.0
超市 nt 7261.0
修理厂 nt 6928.0
总代理 nt 6703.0
办 nt 6651.0
营业部 nt 6625.0
制衣厂 nt 6191.0
铸造厂 nt 6010.0
总汇 nt 5997.0
服装厂 nt 5863.0
总厂 nt 5703.0
分厂 nt 5341.0
实业 nt 5036.0
社 nt 4896.0
办公室 nt 4844.0
协会 nt 4722.0
坊 nt 4687.0
总部 nt 4621.0
药店 nt 4591.0
局 nt 4590.0
机构 nt 4569.0
酒厂 nt 4496.0
分店 nt 4248.0
门市 nt 3942.0
建材厂 nt 3931.0
大酒店 nt 3913.0
中学 nt 3849.0
俱乐部 nt 3721.0
旅行社 nt 3691.0
企业 nt 3655.0
鞋业 nt 3573.0
酒店 nt 3473.0
科技 nt 3367.0
服装店 nt 3275.0
工程部 nt 3231.0
分部 nt 3222.0
商务部 nt 3171.0
城 nt 3118.0
造纸厂 nt 3055.0
仪器厂 nt 3051.0
收购站 nt 3023.0
网 nt 3019.0
市场 nt 3018.0
部门 nt 2974.0
针织厂 nt 2954.0
修配厂 nt 2904.0
仪表厂 nt 2817.0
农场 nt 2786.0
股份公司 nt 2742.0
工厂 nt 2739.0
村委会 nt 2696.0
心 nt 2683.0
室 nt 2655.0
服务站 nt 2654.0
小学 nt 2448.0
研究院 nt 2409.0
专营店 nt 2386.0
信用社 nt 2340.0
饲料厂 nt 2326.0
饭店 nt 2303.0
书店 nt 2283.0
苗圃 nt 2267.0
管理所 nt 2258.0
科 nt 2252.0
分社 nt 2223.0
网吧 nt 2185.0
园 nt 2134.0
大学 nt 2075.0
点 nt 2067.0
林场 nt 2021.0
销售 nt 1991.0
纺织厂 nt 1973.0
支局 nt 1963.0
药房 nt 1922.0
管理局 nt 1907.0
药业 nt 1907.0
电厂 nt 1810.0
院 nt 1810.0
电子 nt 1790.0
连锁店 nt 1779.0
煤厂 nt 1705.0
布厂 nt 1650.0
馆 nt 1645.0
冶炼厂 nt 1603.0
粮站 nt 1600.0
纸厂 nt 1572.0
商城 nt 1565.0
支公司 nt 1563.0
制作厂 nt 1554.0
委员会 nt 1534.0
经销 nt 1530.0
司 nt 1445.0
批发 nt 1439.0
代理 nt 1426.0
营业厅 nt 1381.0
织造厂 nt 1380.0
精品店 nt 1376.0
煤矿 nt 1361.0
营业所 nt 1356.0
回收站 nt 1343.0
营部 nt 1335.0
招待所 nt 1319.0
兽医站 nt 1308.0
宾馆 nt 1301.0
百货公司 nt 1296.0
百货商店 nt 1291.0
种子公司 nt 1277.0
专卖 nt 1265.0
牧业 nt 1257.0
矿 nt 1200.0
商社 nt 1196.0
联络处 nt 1188.0
工程队 nt 1168.0
发行 nt 1164.0
酒楼 nt 1156.0
电器 nt 1155.0
医院 nt 1139.0
矿业 nt 1095.0
五金店 nt 1061.0
铺 nt 1046.0
贸易 nt 1033.0
用品 nt 1031.0
丝厂 nt 1029.0
供销社 nt 1024.0
粮店 nt 1009.0
幼儿园 nt 991.0
化工 nt 986.0
汽修厂 nt 968.0
礼品店 nt 966.0
分局 nt 961.0
旅馆 nt 950.0
维修 nt 913.0
管理处 nt 913.0
组 nt 901.0
商贸 nt 896.0
水厂 nt 895.0
广场 nt 871.0
餐厅 nt 865.0
财政所 nt 864.0
处理厂 nt 845.0
卫生室 nt 829.0
屋 nt 805.0
服饰 nt 775.0
邮政局 nt 749.0
机械 nt 748.0
玩具 nt 737.0
伟业 nt 731.0
生产厂 nt 713.0
总店 nt 701.0
家电 nt 692.0
系 nt 684.0
农业局 nt 679.0
食品店 nt 675.0
货运 nt 671.0
分站 nt 669.0
百货店 nt 656.0
服部 nt 645.0
店铺 nt 629.0
设备 nt 625.0
开发部 nt 620.0
总站 nt 605.0
轧钢厂 nt 596.0
装饰 nt 596.0
设计院 nt 589.0
个体 nt 588.0
业务 nt 570.0
电信局 nt 565.0
通讯 nt 563.0
新华书店 nt 562.0
大队 nt 559.0
食杂店 nt 558.0
广告 nt 555.0
推广站 nt 553.0
棉纺厂 nt 550.0
世界 nt 544.0
大厦 nt 538.0
生产队 nt 537.0
电器行 nt 531.0
分场 nt 530.0
玩具店 nt 528.0
经办 nt 526.0
卫生站 nt 519.0
发电厂 nt 518.0
农药厂 nt 517.0
干洗店 nt 516.0
配件 nt 516.0
设计 nt 512.0
花店 nt 511.0
研究室 nt 511.0
库 nt 510.0
加工 nt 509.0
分行 nt 505.0
储备库 nt 501.0
车间 nt 499.0
化妆品 nt 493.0
粮库 nt 492.0
肉联厂 nt 483.0
内贸部 nt 480.0
文具店 nt 479.0
服务 nt 476.0
株式会社 nt 476.0
阁 nt 461.0
材料 nt 457.0
支行 nt 445.0
代销店 nt 442.0
作坊 nt 441.0
淀粉厂 nt 440.0
经营户 nt 438.0
服务队 nt 436.0
杂志社 nt 420.0
实验室 nt 420.0
维修厂 nt 419.0
商铺 nt 419.0
服务处 nt 416.0
厅 nt 414.0
厂家 nt 403.0
度假村 nt 397.0
农业 nt 394.0
电视台 nt 394.0
传媒 nt 393.0
金店 nt 391.0
出版社 nt 385.0
杂货店 nt 385.0
建筑队 nt 380.0
联盟 nt 379.0
电脑 nt 372.0
指挥部 nt 370.0
涂料 nt 367.0
工业 nt 360.0
杂货铺 nt 357.0
副食店 nt 357.0
会 nt 355.0
油坊 nt 354.0
堂 nt 353.0
仓库 nt 353.0
时装店 nt 353.0
网络 nt 351.0
炼油厂 nt 351.0
茶场 nt 347.0
回收 nt 345.0
高级中学 nt 344.0
有限 nt 343.0
热电厂 nt 341.0
工区 nt 339.0
经销商 nt 338.0
介绍所 nt 335.0
代理商 nt 333.0
印染厂 nt 331.0
检疫站 nt 331.0
铁矿 nt 328.0
家具 nt 324.0
加盟店 nt 323.0
银行 nt 321.0
糖厂 nt 321.0
连锁 nt 317.0
物业 nt 315.0
子公司 nt 312.0
工会 nt 301.0
酒家 nt 300.0
楼 nt 297.0
军 nt 295.0
军区 nt 1.0
贸 nt 294.0
器材 nt 294.0
工程 nt 292.0
太阳能 nt 292.0
旅社 nt 289.0
饰品 nt 289.0
种植园 nt 286.0
置业 nt 286.0
制品 nt 282.0
煤场 nt 279.0
良种场 nt 277.0
销售点 nt 276.0
国际 nt 276.0
洗衣店 nt 276.0
停车场 nt 275.0
棉织厂 nt 274.0
销售科 nt 274.0
药厂 nt 274.0
美容院 nt 273.0
化工部 nt 273.0
摄影 nt 272.0
油漆厂 nt 270.0
采购站 nt 269.0
商厦 nt 266.0
建材 nt 266.0
分校 nt 265.0
农庄 nt 264.0
灯饰 nt 264.0
理发店 nt 263.0
苑 nt 262.0
吧 nt 260.0
数码 nt 258.0
商 nt 256.0
百货 nt 256.0
材料部 nt 256.0
鞋行 nt 255.0
铝厂 nt 251.0
旅店 nt 251.0
商务 nt 249.0
工学院 nt 248.0
无限公司 nt 247.0
造船厂 nt 246.0
分理处 nt 245.0
园区 nt 243.0
五金 nt 240.0
印刷 nt 240.0
分中心 nt 240.0
礼品 nt 238.0
油库 nt 237.0
培训部 nt 237.0
庄园 nt 236.0
专科学校 nt 230.0
农技站 nt 228.0
会馆 nt 228.0
饮食店 nt 225.0
师范学院 nt 222.0
渔场 nt 222.0
修理店 nt 222.0
公寓 nt 221.0
服装 nt 220.0
食品 nt 220.0
居 nt 219.0
售票处 nt 218.0
运输队 nt 218.0
音响 nt 218.0
经营者 nt 217.0
收费站 nt 216.0
零售店 nt 216.0
货栈 nt 216.0
专柜 nt 215.0
大全 nt 215.0
培训 nt 215.0
镇政府 nt 214.0
养鸡场 nt 214.0
林 nt 211.0
邮电所 nt 211.0
联合会 nt 211.0
润滑油 nt 210.0
联社 nt 209.0
商会 nt 209.0
教育 nt 206.0
发网 nt 206.0
转运站 nt 202.0
化学 nt 201.0
照相馆 nt 201.0
分会 nt 200.0
山庄 nt 199.0
纺 nt 199.0
工艺 nt 198.0
号 nt 197.0
礼品部 nt 197.0
包装 nt 196.0
工艺品 nt 196.0
师范大学 nt 193.0
研究会 nt 193.0
公证处 nt 190.0
学会 nt 189.0
家具城 nt 188.0
百货大楼 nt 187.0
工贸 nt 187.0
兽药厂 nt 185.0
轮胎 nt 183.0
照明 nt 183.0
养猪场 nt 182.0
汇 nt 182.0
珠宝店 nt 180.0
通信 nt 178.0
车站 nt 178.0
科学院 nt 178.0
咨询 nt 177.0
制作 nt 176.0
信息网 nt 176.0
养殖 nt 175.0
软件 nt 175.0
科研所 nt 175.0
食堂 nt 174.0
变电站 nt 174.0
示范园 nt 173.0
轩 nt 173.0
繁殖场 nt 173.0
班 nt 172.0
工商户 nt 171.0
自选商场 nt 170.0
大楼 nt 170.0
机电 nt 167.0
经理 nt 166.0
团 nt 166.0
医药 nt 166.0
个体户 nt 165.0
养蜂场 nt 164.0
管委会 nt 162.0
猪场 nt 162.0
供电局 nt 162.0
营业 nt 161.0
本部 nt 161.0
车队 nt 161.0
果园 nt 160.0
制造 nt 159.0
沙龙 nt 159.0
人民政府 nt 159.0
体校 nt 158.0
快餐店 nt 158.0
个人 nt 157.0
经销点 nt 154.0
油公司 nt 154.0
茶坊 nt 154.0
纱厂 nt 153.0
浴池 nt 153.0
交易所 nt 152.0
产品 nt 150.0
厂部 nt 149.0
技术学校 nt 148.0
学生 nt 147.0
检查站 nt 146.0
医学院 nt 146.0
在线 nt 145.0
医务室 nt 145.0
站台 nt 144.0
美容 nt 142.0
小吃店 nt 142.0
校 nt 142.0
中转站 nt 141.0
租赁 nt 141.0
电子部 nt 141.0
果场 nt 141.0
金行 nt 141.0
技术 nt 139.0
货场 nt 139.0
外贸 nt 136.0
采购 nt 136.0
茶店 nt 135.0
书屋 nt 135.0
驾校 nt 135.0
烤鸭店 nt 134.0
客运站 nt 133.0
营销 nt 132.0
代办处 nt 132.0
行业 nt 131.0
冷库 nt 130.0
饭庄 nt 130.0
小卖部 nt 130.0
物资部 nt 129.0
管理 nt 129.0
试验场 nt 129.0
平台 nt 128.0
商贸城 nt 128.0
完小 nt 128.0
孵化场 nt 128.0
人事部 nt 127.0
电气 nt 126.0
屠宰场 nt 126.0
修理 nt 125.0
精品屋 nt 125.0
内部 nt 125.0
专营 nt 125.0
渔业 nt 124.0
园艺 nt 123.0
联营厂 nt 123.0
牧场 nt 123.0
艺术团 nt 122.0
开发 nt 121.0
商学院 nt 120.0
工务段 nt 120.0
陶瓷 nt 120.0
洗染店 nt 120.0
模具 nt 119.0
策划 nt 118.0
初级中学 nt 118.0
日化 nt 118.0
供应 nt 117.0
中专 nt 117.0
促进会 nt 117.0
拍卖行 nt 116.0
编辑部 nt 116.0
小组 nt 116.0
示范场 nt 115.0
商业 nt 115.0
餐饮部 nt 115.0
采油厂 nt 114.0
师范学校 nt 113.0
诊所 nt 111.0
石化 nt 111.0
总会 nt 111.0
斋 nt 110.0
火柴厂 nt 110.0
工具 nt 110.0
汽修 nt 110.0
面包房 nt 110.0
纺织 nt 109.0
运输 nt 109.0
机电部 nt 108.0
组委会 nt 108.0
采石场 nt 107.0
布艺 nt 107.0
精品 nt 107.0
公路局 nt 107.0
信息 nt 106.0
支队 nt 106.0
布店 nt 104.0
团队 nt 104.0
供应商 nt 103.0
中心校 nt 102.0
乐园 nt 101.0
石材 nt 101.0
茶叶 nt 101.0
车务段 nt 101.0
邮电局 nt 100.0
农资 nt 100.0
石油城 nt 99.0
出租 nt 99.0
餐馆 nt 98.0
网站 nt 98.0
门诊 nt 98.0
鸡场 nt 98.0
舍 nt 97.0
乐器 nt 94.0
宣传部 nt 94.0
股份 nt 93.0
代表 nt 92.0
北京 nt 92.0
系统 nt 91.0
铺子 nt 91.0
图书馆 nt 91.0
名称 nt 91.0
缫丝厂 nt 91.0
职业中学 nt 91.0
服务所 nt 91.0
供应科 nt 90.0
汽车 nt 90.0
制药 nt 90.0
光电 nt 89.0
花园 nt 89.0
工场 nt 89.0
购物 nt 88.0
仪器 nt 88.0
畜牧场 nt 87.0
教研室 nt 87.0
寻呼台 nt 87.0
房地产 nt 87.0
电台 nt 86.0
种畜场 nt 86.0
粮食局 nt 85.0
家园 nt 85.0
商务处 nt 85.0
沙场 nt 85.0
苗木 nt 85.0
热水器 nt 84.0
支店 nt 84.0
装潢 nt 84.0
自动化 nt 83.0
货运站 nt 83.0
理工学院 nt 83.0
家私 nt 83.0
汽车站 nt 83.0
零售 nt 83.0
批发商 nt 83.0
食品部 nt 82.0
门诊部 nt 82.0
铜矿 nt 82.0
报社 nt 81.0
机务段 nt 81.0
鹿场 nt 80.0
麻纺厂 nt 80.0
发行部 nt 80.0
基业 nt 80.0
加盟 nt 80.0
传播 nt 80.0
服装城 nt 80.0
画室 nt 79.0
塑料 nt 79.0
林业 nt 79.0
小家电 nt 79.0
歌舞厅 nt 78.0
珠宝 nt 78.0
钻井队 nt 78.0
产业 nt 77.0
服务网 nt 77.0
商业城 nt 77.0
耗材 nt 77.0
艺术馆 nt 76.0
酒吧 nt 76.0
沥青厂 nt 76.0
展览会 nt 75.0
供应点 nt 75.0
摊床 nt 74.0
二手车 nt 74.0
技校 nt 74.0
电讯 nt 74.0
生产 nt 74.0
变电所 nt 73.0
电梯 nt 73.0
植保站 nt 70.0
农经站 nt 70.0
盐场 nt 70.0
监测站 nt 70.0
钟表店 nt 68.0
彩印 nt 68.0
小学校 nt 66.0
招生办 nt 66.0
网点 nt 66.0
安装 nt 66.0
基金会 nt 66.0
水电站 nt 65.0
课题组 nt 15.0
游戏厅 nt 6.0
航空港 nt 3.0
师部 nt 1.0
农校 nt 1.0
地质队 nt 1.0
镇 ns 13727.0
乡 ns 12503.0
街道 ns 4309.0
村 ns 3266.0
社区 ns 2100.0
县 ns 1417.0
胡同 ns 882.0
区 ns 834.0
市 ns 308.0
城镇 ns 298.0
山乡 ns 295.0
苏木 ns 258.0
居委会 ns 231.0
村镇 ns 205.0
道 ns 187.0
集镇 ns 187.0
开发区 ns 181.0
市镇 ns 137.0
自治县 ns 131.0
家乡 ns 102.0
地区 ns 91.0
城乡 ns 81.0
山区 ns 75.0
城区 ns 61.0
旗 ns 55.0
州 ns 55.0
水乡 ns 47.0
东乡 ns 37.0
街 ns 36.0
山村 ns 35.0
监狱 ns 33.0
自治州 ns 30.0
营 ns 29.0
管理区 ns 28.0
群岛 ns 27.0
水库 ns 21.0
北乡 ns 21.0
乡镇 ns 20.0
桥 ns 18.0
南县 ns 18.0
新区 ns 17.0
古镇 ns 17.0
民族乡 ns 17.0
工业区 ns 16.0
下乡 ns 16.0
竹乡 ns 16.0
丰县 ns 16.0
矿区 ns 15.0
湖 ns 14.0
塔 ns 13.0
东区 ns 13.0
兴县 ns 12.0
果乡 ns 11.0
西村 ns 11.0
巷 ns 11.0
湾 ns 11.0
市辖区 ns 10.0
南区 ns 10.0
家委会 ns 10.0
庄 ns 10.0
亭 ns 10.0
塘 ns 9.0
家村 ns 9.0
泉 ns 9.0
市区 ns 9.0
庵 ns 9.0
堡 ns 8.0
劳教所 nt 8.0
郊区 ns 8.0
老乡 ns 8.0
坝 ns 8.0
王庄村 ns 8.0
城市 ns 8.0
村村 ns 7.0
宁乡 ns 7.0
沟 ns 7.0
海区 ns 7.0
浦 ns 7.0
风景区 ns 7.0
潭 ns 7.0
官庄村 ns 7.0
虚拟 ns 7.0
邑 ns 6.0
小区 ns 6.0
关 ns 6.0
西区 ns 6.0
花乡 ns 6.0
房 ns 6.0
卡 ns 6.0
定 ns 6.0
岗 ns 6.0
直镇 ns 6.0
林区 ns 6.0
林县 ns 6.0
辖 ns 6.0
特区 ns 6.0
冈 ns 5.0
岗区 ns 5.0
辛店村 ns 5.0
管区 ns 5.0
达县 ns 5.0
寨 ns 5.0
新县 ns 5.0
谷 ns 5.0
农科所 nt 5.0
岭 ns 5.0
自治区 ns 5.0
夹道 ns 5.0
滩 ns 5.0
坡 ns 5.0
坪 ns 5.0
新村 ns 5.0
大江 ns 4.0
桐乡 ns 4.0
栅栏 ns 4.0
全镇 ns 4.0
神庙 ns 4.0
里庄村 ns 4.0
自然保护区 ns 4.0
沙洲 ns 4.0
同乡 ns 4.0
依达乡 ns 4.0
巴县 ns 4.0
洲 ns 4.0
官庄镇 ns 4.0
路 ns 4.0
溪口镇 ns 3.0
垦区 ns 3.0
乌镇 ns 3.0
水井 ns 3.0
景区 ns 3.0
回族 ns 3.0
马桥镇 ns 3.0
公园 ns 3.0
回乡 ns 3.0
营区 ns 3.0
名胜区 ns 3.0
刘庄村 ns 3.0
辛店镇 ns 3.0
本乡 ns 3.0
西亚 ns 3.0
竹园镇 ns 3.0
高村 ns 3.0
北河乡 ns 2.0
萨摩亚 ns 2.0
竹林镇 ns 2.0
瑶乡 ns 2.0
拉西乡 ns 2.0
张庄村 ns 2.0
柏林 ns 2.0
大门 ns 2.0
示范区 ns 2.0
渔乡 ns 2.0
联邦 ns 2.0
马桩 ns 2.0
卡拉 ns 2.0
站区 ns 2.0
本镇 ns 2.0
圣庙 ns 2.0
大街 ns 2.0
共和国 ns 2.0
宽街 ns 2.0
太平村 ns 2.0
开县 ns 2.0
庙街 ns 2.0
杨村 ns 2.0
苏州 ns 2.0
西庄村 ns 2.0
重镇 ns 2.0
农区 ns 2.0
水口镇 ns 2.0
岸区 ns 2.0
西沟村 ns 2.0
官园 ns 2.0
菜园 ns 2.0
开发办 ns 2.0
保税区 ns 2.0
试验区 ns 2.0
桃园 ns 2.0
文县 ns 2.0
全县 ns 2.0
岔河镇 ns 2.0
宿县 ns 2.0
易县 ns 1.0
洋县 ns 1.0
华里 ns 1.0
阿图什 ns 1.0
城西乡 ns 1.0
布市 ns 1.0
天池 ns 1.0
坎市 ns 1.0
钓鱼台 ns 1.0
海淀区 ns 1.0
通州区 ns 1.0
土沟村 ns 1.0
文昌阁 ns 1.0
聂庄村 ns 1.0
西城区 ns 1.0
密云县 ns 1.0
唐庄镇 ns 1.0
返乡 ns 1.0
炒面 ns 1.0
黄村 ns 1.0
吉祥村 ns 1.0
行政区 ns 1.0
塘沽区 ns 1.0
市直 ns 1.0
邱县 ns 1.0
农村 ns 1.0
海域 ns 1.0
沙湾镇 ns 1.0
南里 ns 1.0
花市 ns 1.0
渠县 ns 1.0
滦县 ns 1.0
并入 ns 1.0
威县 ns 1.0
后河乡 ns 1.0
晋安区 ns 1.0
酋长国 ns 1.0
城口县 ns 1.0
渡口 ns 1.0
思乡 ns 1.0
达科他州 ns 1.0
西青区 ns 1.0
新罗区 ns 1.0
江北区 ns 1.0
宣武区 ns 1.0
徐汇区 ns 1.0
茅山 ns 1.0
松岗镇 ns 1.0
大河乡 ns 1.0
筒子 ns 1.0
黄浦区 ns 1.0
门头沟区 ns 1.0
石门镇 ns 1.0
呼和浩特 ns 1.0
寺沟乡 ns 1.0
塘桥镇 ns 1.0
太平镇 ns 1.0
渔港 ns 1.0
上坡 ns 1.0
马銮湾 ns 1.0
营房 ns 1.0
边区 ns 1.0
比尔 ns 1.0
蓟县 ns 1.0
东江镇 ns 1.0
石景山区 ns 1.0
太仓 ns 1.0
官厅 ns 1.0
市郊 ns 1.0
伦敦 ns 1.0
津南区 ns 1.0
小镇 ns 1.0
西固区 ns 1.0
四平乡 ns 1.0
水头乡 ns 1.0
马普托 ns 1.0
场区 ns 1.0
闵行区 ns 1.0
龙头乡 ns 1.0
港口 ns 1.0
长宁区 ns 1.0
北辰区 ns 1.0
梅山镇 ns 1.0
仓山区 ns 1.0
澳洲 ns 1.0
萍乡 ns 1.0
嘉定区 ns 1.0
区域 ns 1.0
沧县 ns 1.0
卡子 ns 1.0
河西区 ns 1.0
渝中区 ns 1.0
柳行镇 ns 1.0
大湖镇 ns 1.0
达拉特旗 ns 1.0
后身 ns 1.0
灌区 ns 1.0
红桥区 ns 1.0
西伯利亚 ns 1.0
南开区 ns 1.0
贸易区 ns 1.0
村委 ns 1.0
茂南区 ns 1.0
常山县 ns 1.0
海南 ns 1.0
草场 ns 1.0
河东区 ns 1.0
常山 ns 1.0
坪坝 ns 1.0
口岸 ns 1.0
大栅栏 ns 1.0
草坪 ns 1.0
安达 ns 1.0
锦旗 ns 1.0
黄县 ns 1.0
泰州市 ns 1.0
东城区 ns 1.0
南市 ns 1.0
河流镇 ns 1.0
宁河县 ns 1.0
亚尔乡 ns 1.0
奉节县 ns 1.0
道口 ns 1.0
鼓楼区 ns 1.0
巫山县 ns 1.0
和平区 ns 1.0
延庆县 ns 1.0
小街 ns 1.0
海岸 ns 1.0
屯河 ns 1.0
丰台区 ns 1.0
杨浦区 ns 1.0
梁平县 ns 1.0
苗乡 ns 1.0
普陀区 ns 1.0
南园 ns 1.0
义县 ns 1.0
长安镇 ns 1.0
大足县 ns 1.0
管教所 nt 1.0
鸽镇 ns 1.0
朝阳区 ns 1.0
东兴 ns 1.0
大营子镇 ns 1.0
石桥 ns 1.0
泰州 ns 1.0
呼和浩特市 ns 1.0
运河 ns 1.0
白旗 ns 1.0
长岭 ns 1.0
水泡 ns 1.0
泥河镇 ns 1.0
沁县 ns 1.0
河北区 ns 1.0
胡兰镇 ns 1.0
寿县 ns 1.0
岛礁 ns 1.0
崇明县 ns 1.0
忠县 ns 1.0
南街 ns 1.0
达斡尔族 ns 1.0

View File

@ -1,51 +0,0 @@
0 5
1 5
2 5
3 5
4 5
5 5
6 5
7 5
8 5
9 5
5
5
5
5
5
5
5
5
5
5
% 5
零 5
一 5
二 5
三 5
四 5
五 5
六 5
七 5
八 5
九 5
十 5
百 5
千 5
万 5
亿 5
兆 5
零 5
壹 5
贰 5
叁 5
肆 5
伍 5
陆 5
柒 5
捌 5
玖 5
拾 5
佰 5
仟 5
5

View File

@ -1,48 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.deeplearning4j.text.tokenization.tokenizer;
import lombok.extern.slf4j.Slf4j;
import java.util.*;
import org.deeplearning4j.BaseDL4JTest;
import org.nd4j.common.tests.AbstractAssertTestsClass;
/**
 * Supplies {@link AbstractAssertTestsClass} with this module's settings:
 * the {@code org.deeplearning4j} package, {@link BaseDL4JTest} as the base
 * class, and no excluded classes.
 *
 * <p>NOTE(review): presumably the parent checks that test classes in the
 * package extend the base class; confirm against AbstractAssertTestsClass.
 */
@Slf4j
public class AssertTestsExtendBaseClass extends AbstractAssertTestsClass {

    /**
     * @return a new, empty, mutable set; no classes are excluded
     */
    @Override
    protected Set<Class<?>> getExclusions() {
        return new HashSet<>();
    }

    /**
     * @return the package name {@code "org.deeplearning4j"}
     */
    @Override
    protected String getPackageName() {
        return "org.deeplearning4j";
    }

    /**
     * @return {@code BaseDL4JTest.class}
     */
    @Override
    protected Class<?> getBaseClass() {
        return BaseDL4JTest.class;
    }
}

View File

@ -1,79 +0,0 @@
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.deeplearning4j.text.tokenization.tokenizer;
import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.BaseDL4JTest;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.nlp.chinese.tokenization.tokenizerFactory.ChineseTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.junit.Ignore;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import static org.junit.Assert.assertEquals;
/**
 * Tests for the tokenizer created by {@link ChineseTokenizerFactory}.
 */
@Slf4j
public class ChineseTokenizerTest extends BaseDL4JTest {

    /** Sentence fed to the tokenizer. */
    private final String toTokenize = "青山绿水和伟大的科学家让世界更美好和平";

    /** Tokens the sentence is expected to be split into, in order. */
    private final String[] expect = {"青山绿水", "和", "伟大", "的", "科学家", "让", "世界", "更", "美好", "和平"};

    /**
     * Tokenizes {@link #toTokenize} and checks both the token count and every token
     * against {@link #expect}.
     */
    @Test
    public void testChineseTokenizer() {
        TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
        Tokenizer tokenizer = tokenizerFactory.create(toTokenize);
        assertEquals(expect.length, tokenizer.countTokens());
        // Iterate over the fixed expected length (already asserted equal to the token
        // count) rather than re-querying the tokenizer while tokens are being consumed.
        for (int i = 0; i < expect.length; ++i) {
            // JUnit order is (message, expected, actual) so failures report correctly.
            assertEquals("token at index " + i, expect[i], tokenizer.nextToken());
        }
    }

    /**
     * Trains a Word2Vec model on a data set of Chinese names and writes the resulting
     * word vectors to disk. Disabled via {@code @Ignore}.
     *
     * <p>NOTE(review): the output file is written under {@code src/test/resources},
     * i.e. into the source tree; confirm this is intended before re-enabling.
     *
     * @throws IOException declared for the file-based iterator and serializer calls
     */
    @Ignore
    @Test
    public void testFindNamesFromText() throws IOException {
        SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");

        log.info("load is right!");
        TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
        //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

        // Build word vectors from the data set stored in the resources folder.
        Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
                .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
        vec.fit();
        WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));

        // TODO: train a model that finds names in news text (.txt files) using the
        // word vectors generated above, then test whether it finds names in unknown text.
        // WordVectors wordVectors;
    }
}

View File

@ -1,78 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ /* ******************************************************************************
~ *
~ *
~ * This program and the accompanying materials are made available under the
~ * terms of the Apache License, Version 2.0 which is available at
~ * https://www.apache.org/licenses/LICENSE-2.0.
~ *
~ * See the NOTICE file distributed with this work for additional
~ * information regarding copyright ownership.
~ * Unless required by applicable law or agreed to in writing, software
~ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
~ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
~ * License for the specific language governing permissions and limitations
~ * under the License.
~ *
~ * SPDX-License-Identifier: Apache-2.0
~ ******************************************************************************/
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven module descriptor for deeplearning4j-nlp-japanese. -->
<modelVersion>4.0.0</modelVersion>
<!-- groupId and version of this module are not declared here; Maven inherits them from the parent below. -->
<parent>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-nlp-parent</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>
<artifactId>deeplearning4j-nlp-japanese</artifactId>
<!-- Version properties local to this module. kuromoji.version is referenced only by the commented-out kuromoji-ipadic dependency further down. -->
<properties>
<kuromoji.version>0.9.0</kuromoji.version>
<randomizedtesting.version>2.1.16</randomizedtesting.version>
</properties>
<dependencies>
<!-- No version or scope is given for junit in this file; presumably both come from dependencyManagement in a parent POM (TODO confirm). -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<!--<dependency>-->
<!--<groupId>com.atilika.kuromoji</groupId>-->
<!--<artifactId>kuromoji-ipadic</artifactId>-->
<!--<version>${kuromoji.version}</version>-->
<!--<type>jar</type>-->
<!--<scope>compile</scope>-->
<!--</dependency>-->
<!-- Sibling NLP module, pinned to the same version as this module via project.version. -->
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-nlp</artifactId>
<version>${project.version}</version>
</dependency>
<!-- Test-scoped only; version comes from the randomizedtesting.version property above. -->
<dependency>
<groupId>com.carrotsearch.randomizedtesting</groupId>
<artifactId>randomizedtesting-runner</artifactId>
<version>${randomizedtesting.version}</version>
<scope>test</scope>
</dependency>
<!-- No version is given for slf4j-api in this file; presumably managed by a parent POM (TODO confirm). -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
</dependencies>
<!-- Both profiles declare only an id and no configuration in this file; presumably they exist so that these ids are known to this module when activated for the whole build (TODO confirm against the parent POM). -->
<profiles>
<profile>
<id>test-nd4j-native</id>
</profile>
<profile>
<id>test-nd4j-cuda-11.0</id>
</profile>
</profiles>
</project>

Some files were not shown because too many files have changed in this diff Show More