Copyright updates, removal of extra nlp modules

parent 8bc3172e40
commit 1eaee7f6d9

@@ -1,20 +1,21 @@
/*
 * ******************************************************************************
 * * Copyright (c) 2021 Deeplearning4j Contributors
 * *
 * * This program and the accompanying materials are made available under the
 * * terms of the Apache License, Version 2.0 which is available at
 * * https://www.apache.org/licenses/LICENSE-2.0.
 * *
 * * Unless required by applicable law or agreed to in writing, software
 * * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * * License for the specific language governing permissions and limitations
 * * under the License.
 * *
 * * SPDX-License-Identifier: Apache-2.0
 * *****************************************************************************
 */

/* ******************************************************************************
 *
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

#!groovy
File diff suppressed because it is too large

@@ -163,6 +163,12 @@
            <artifactId>oshi-core</artifactId>
            <version>${oshi.version}</version>
        </dependency>
        <dependency>
            <groupId>org.nd4j</groupId>
            <artifactId>nd4j-native</artifactId>
            <version>${project.version}</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <profiles>
@@ -64,7 +64,7 @@ public class CNN1DGradientCheckTest extends BaseDL4JTest {

     @Override
     public long getTimeoutMilliseconds() {
-        return 90000L;
+        return 180000;
     }

     @Test
@@ -1,70 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~ /* ******************************************************************************
  ~  *
  ~  *
  ~  * This program and the accompanying materials are made available under the
  ~  * terms of the Apache License, Version 2.0 which is available at
  ~  * https://www.apache.org/licenses/LICENSE-2.0.
  ~  *
  ~  * See the NOTICE file distributed with this work for additional
  ~  * information regarding copyright ownership.
  ~  * Unless required by applicable law or agreed to in writing, software
  ~  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  ~  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  ~  * License for the specific language governing permissions and limitations
  ~  * under the License.
  ~  *
  ~  * SPDX-License-Identifier: Apache-2.0
  ~  ******************************************************************************/
  -->

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.deeplearning4j</groupId>
        <artifactId>deeplearning4j-nlp-parent</artifactId>
        <version>1.0.0-SNAPSHOT</version>
    </parent>

    <groupId>org.deeplearning4j</groupId>
    <artifactId>deeplearning4j-nlp-chinese</artifactId>
    <version>1.0.0-SNAPSHOT</version>

    <properties>
        <slf4j-api.version>1.6.4</slf4j-api.version>
        <logback-classic.version>0.9.28</logback-classic.version>
        <nlp-lang.version>1.7.2</nlp-lang.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
        </dependency>
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-nlp</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>org.nlpcn</groupId>
            <artifactId>nlp-lang</artifactId>
            <version>${nlp-lang.version}</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <profiles>
        <profile>
            <id>test-nd4j-native</id>
        </profile>
        <profile>
            <id>test-nd4j-cuda-11.0</id>
        </profile>
    </profiles>
</project>
@@ -1,251 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.app.crf;
|
||||
|
||||
import org.ansj.app.crf.pojo.Element;
|
||||
import org.nlpcn.commons.lang.util.WordAlert;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class Config {
|
||||
|
||||
public String splitStr = "\\s+";
|
||||
|
||||
public Config(int[][] template) {
|
||||
this.template = template;
|
||||
}
|
||||
|
||||
public static final int TAG_NUM = 4; // 标记类型写死了4个
|
||||
|
||||
// 特殊字符的标注
|
||||
public static final char BEGIN = 128;
|
||||
|
||||
public static final char END = 129;
|
||||
|
||||
public static final char NUM_BEGIN = 130;
|
||||
|
||||
public static final char EN_BEGIN = 140;
|
||||
|
||||
public static final char FEATURE_BEGIN = 150;
|
||||
|
||||
public static char getNum(String str) {
|
||||
if (str.length() > 9) {
|
||||
return NUM_BEGIN;
|
||||
} else {
|
||||
return (char) (NUM_BEGIN + str.length());
|
||||
}
|
||||
}
|
||||
|
||||
public static char getEn(String str) {
|
||||
if (str.length() > 9) {
|
||||
return EN_BEGIN;
|
||||
} else {
|
||||
return (char) (EN_BEGIN + str.length());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// 字标注类型
|
||||
public static int S = 0;
|
||||
public static int B = 1;
|
||||
public static int M = 2;
|
||||
public static int E = 3;
|
||||
|
||||
private int[][] template = {{-2}, {-1}, {0}, {1}, {2}, {-2, -1}, {-1, 0}, {0, 1}, {1, 2}, {-1, 1}};
|
||||
|
||||
public int[][] getTemplate() {
|
||||
return template;
|
||||
}
|
||||
|
||||
public void setTemplate(int[][] template) {
|
||||
this.template = template;
|
||||
}
|
||||
|
||||
/**
|
||||
* 词语标准化
|
||||
*
|
||||
* @param word
|
||||
* @return
|
||||
*/
|
||||
public static List<Element> wordAlert(String word) {
|
||||
|
||||
char[] chars = WordAlert.alertStr(word);
|
||||
|
||||
List<Element> list = new ArrayList<>();
|
||||
|
||||
StringBuilder tempSb = new StringBuilder();
|
||||
|
||||
int status = 0; // 1 num 2 english
|
||||
|
||||
Element element = null;
|
||||
|
||||
for (int i = 0; i < chars.length; i++) {
|
||||
|
||||
if (chars[i] >= '0' && chars[i] <= '9') {
|
||||
if (status == 2) {
|
||||
element = new Element(Config.getNum(tempSb.toString()));
|
||||
element.len = tempSb.length();
|
||||
list.add(element);
|
||||
tempSb = new StringBuilder();
|
||||
}
|
||||
tempSb.append(chars[i]);
|
||||
status = 1;
|
||||
} else if (chars[i] >= 'A' && chars[i] <= 'z') {
|
||||
if (status == 1) {
|
||||
element = new Element(Config.getEn(tempSb.toString()));
|
||||
element.len = tempSb.length();
|
||||
list.add(element);
|
||||
tempSb = new StringBuilder();
|
||||
}
|
||||
tempSb.append(chars[i]);
|
||||
status = 2;
|
||||
} else {
|
||||
if (status == 1) {
|
||||
element = new Element(Config.getNum(tempSb.toString()));
|
||||
element.len = tempSb.length();
|
||||
list.add(element);
|
||||
} else if (status == 2) {
|
||||
element = new Element(Config.getEn(tempSb.toString()));
|
||||
element.len = tempSb.length();
|
||||
list.add(element);
|
||||
}
|
||||
tempSb = new StringBuilder();
|
||||
list.add(new Element(chars[i]));
|
||||
status = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (tempSb.length() > 0) {
|
||||
if (status == 1) {
|
||||
element = new Element(Config.getNum(tempSb.toString()));
|
||||
element.len = tempSb.length();
|
||||
list.add(element);
|
||||
} else if (status == 2) {
|
||||
element = new Element(Config.getEn(tempSb.toString()));
|
||||
element.len = tempSb.length();
|
||||
list.add(element);
|
||||
} else {
|
||||
System.out.println("err!");
|
||||
}
|
||||
}
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param temp
|
||||
* @return
|
||||
*/
|
||||
public static List<Element> makeToElementList(String temp, String splitStr) {
|
||||
String[] split = temp.split(splitStr);
|
||||
List<Element> list = new ArrayList<>(temp.length());
|
||||
|
||||
for (String word : split) {
|
||||
|
||||
List<Element> wordAlert = wordAlert(word);
|
||||
|
||||
int len = wordAlert.size();
|
||||
|
||||
if (len == 1) {
|
||||
wordAlert.get(0).updateTag(Config.S);
|
||||
} else if (len == 2) {
|
||||
wordAlert.get(0).updateTag(Config.B);
|
||||
wordAlert.get(1).updateTag(Config.E);
|
||||
} else if (len > 2) {
|
||||
wordAlert.get(0).updateTag(Config.B);
|
||||
for (int i = 1; i < len - 1; i++) {
|
||||
wordAlert.get(i).updateTag(Config.M);
|
||||
}
|
||||
wordAlert.get(len - 1).updateTag(Config.E);
|
||||
}
|
||||
|
||||
list.addAll(wordAlert);
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
public List<Element> makeToElementList(String temp) {
|
||||
return wordAlert(temp);
|
||||
}
|
||||
|
||||
public char getNameIfOutArr(List<Element> list, int index) {
|
||||
if (index < 0) {
|
||||
return Config.BEGIN;
|
||||
} else if (index >= list.size()) {
|
||||
return Config.END;
|
||||
} else {
|
||||
return list.get(index).name;
|
||||
}
|
||||
}
|
||||
|
||||
public char getTagIfOutArr(List<Element> list, int index) {
|
||||
if (index < 0 || index >= list.size()) {
|
||||
return 0;
|
||||
} else {
|
||||
return (char) list.get(index).getTag();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 得到一个位置的所有特征
|
||||
*
|
||||
* @param list
|
||||
* @param index
|
||||
* @return KeyValue(词语,featureLength*tagNum)
|
||||
*/
|
||||
public char[][] makeFeatureArr(List<Element> list, int index) {
|
||||
char[][] result = new char[template.length][];
|
||||
char[] chars = null;
|
||||
int len = 0;
|
||||
int i = 0;
|
||||
for (; i < template.length; i++) {
|
||||
if (template[i].length == 0) {
|
||||
continue;
|
||||
}
|
||||
chars = new char[template[i].length + 1];
|
||||
len = chars.length - 1;
|
||||
for (int j = 0; j < len; j++) {
|
||||
chars[j] = getNameIfOutArr(list, index + template[i][j]);
|
||||
}
|
||||
chars[len] = (char) (FEATURE_BEGIN + i);
|
||||
result[i] = chars;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public static char getTagName(int tag) {
|
||||
switch (tag) {
|
||||
case 0:
|
||||
return 'S';
|
||||
case 1:
|
||||
return 'B';
|
||||
case 2:
|
||||
return 'M';
|
||||
case 3:
|
||||
return 'E';
|
||||
default:
|
||||
return '?';
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1,81 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.app.crf;
|
||||
|
||||
import org.ansj.app.crf.pojo.Element;
|
||||
import org.nlpcn.commons.lang.util.IOUtil;
|
||||
import org.nlpcn.commons.lang.util.StringUtil;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public class MakeTrainFile {
|
||||
|
||||
private static final Log logger = LogFactory.getLog();
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
String inputPath = "corpus.txt";
|
||||
|
||||
String outputPath = "train.txt";
|
||||
|
||||
if (args != null && args.length == 2) {
|
||||
inputPath = args[0];
|
||||
outputPath = args[1];
|
||||
}
|
||||
|
||||
if (StringUtil.isBlank(inputPath) || StringUtil.isBlank(outputPath)) {
|
||||
logger.info("org.ansj.app.crf.MakeTrainFile [inputPath] [outputPath]");
|
||||
return;
|
||||
}
|
||||
try (BufferedReader reader = IOUtil.getReader(inputPath, "utf-8");
|
||||
FileOutputStream fos = new FileOutputStream(outputPath)) {
|
||||
String temp = null;
|
||||
int i = 0;
|
||||
while ((temp = reader.readLine()) != null) {
|
||||
StringBuilder sb = new StringBuilder("\n");
|
||||
if (StringUtil.isBlank(temp)) {
|
||||
continue;
|
||||
}
|
||||
if (i == 0) {
|
||||
temp = StringUtil.trim(temp);
|
||||
}
|
||||
List<Element> list = Config.makeToElementList(temp, "\\s+");
|
||||
for (Element element : list) {
|
||||
sb.append(element.nameStr() + " " + Config.getTagName(element.getTag()));
|
||||
sb.append("\n");
|
||||
}
|
||||
fos.write(sb.toString().getBytes(IOUtil.UTF8));
|
||||
System.out.println(++i);
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.warn("文件没有找到", e);
|
||||
} catch (IOException e) {
|
||||
logger.warn("IO异常", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,196 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.app.crf;
|
||||
|
||||
import org.ansj.app.crf.model.CRFModel;
|
||||
import org.ansj.app.crf.model.CRFppTxtModel;
|
||||
import org.ansj.app.crf.model.WapitiCRFModel;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
import org.nlpcn.commons.lang.util.MapCount;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
public abstract class Model {
|
||||
|
||||
public static final Log logger = LogFactory.getLog(Model.class);
|
||||
|
||||
protected Config config;
|
||||
|
||||
protected SmartForest<float[]> featureTree = null;
|
||||
|
||||
protected float[][] status = new float[Config.TAG_NUM][Config.TAG_NUM];
|
||||
|
||||
public int allFeatureCount = 0;
|
||||
|
||||
/**
|
||||
* 判断当前数据流是否是本实例
|
||||
*
|
||||
* @param is
|
||||
* @return
|
||||
*/
|
||||
public abstract boolean checkModel(String modelPath) throws IOException;
|
||||
|
||||
/**
|
||||
* 模型读取
|
||||
*
|
||||
* @param path
|
||||
* @return
|
||||
* @return
|
||||
* @throws Exception
|
||||
*/
|
||||
public static Model load(String modelPath) throws Exception {
|
||||
Model model = new CRFModel();
|
||||
if (model.checkModel(modelPath)) {
|
||||
return model.loadModel(modelPath);
|
||||
}
|
||||
model = new CRFppTxtModel();
|
||||
|
||||
if (model.checkModel(modelPath)) {
|
||||
return model.loadModel(modelPath);
|
||||
}
|
||||
model = new WapitiCRFModel();
|
||||
if (model.checkModel(modelPath)) {
|
||||
return model.loadModel(modelPath);
|
||||
}
|
||||
throw new Exception("I did not know what type of model by file " + modelPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* 模型读取
|
||||
*
|
||||
*/
|
||||
public static Model load(Class<? extends Model> modelClass, InputStream inputStream) throws Exception {
|
||||
return modelClass
|
||||
.getDeclaredConstructor()
|
||||
.newInstance()
|
||||
.loadModel(inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
* 不同的模型实现自己的加载模型类
|
||||
*
|
||||
* @throws Exception
|
||||
*/
|
||||
public abstract Model loadModel(String modelPath) throws Exception;
|
||||
|
||||
public abstract Model loadModel(InputStream is) throws Exception;
|
||||
|
||||
/**
|
||||
* 获得特征所在权重数组
|
||||
*
|
||||
* @param featureStr
|
||||
* @return
|
||||
*/
|
||||
public float[] getFeature(char... chars) {
|
||||
if (chars == null) {
|
||||
return null;
|
||||
}
|
||||
SmartForest<float[]> sf = featureTree;
|
||||
sf = sf.getBranch(chars);
|
||||
if (sf == null || sf.getParam() == null) {
|
||||
return null;
|
||||
}
|
||||
return sf.getParam();
|
||||
}
|
||||
|
||||
public Config getConfig() {
|
||||
return this.config;
|
||||
}
|
||||
|
||||
/**
|
||||
* tag转移率
|
||||
*
|
||||
* @param s1
|
||||
* @param s2
|
||||
* @return
|
||||
*/
|
||||
public float tagRate(int s1, int s2) {
|
||||
return status[s1][s2];
|
||||
}
|
||||
|
||||
/**
|
||||
* 增加特征到特征数中
|
||||
*
|
||||
* @param cs
|
||||
* @param tempW
|
||||
*/
|
||||
protected static void printFeatureTree(String cs, float[] tempW) {
|
||||
String name = "*";
|
||||
if (tempW.length == 4) {
|
||||
name = "U";
|
||||
}
|
||||
name += "*" + (cs.charAt(cs.length() - 1) - Config.FEATURE_BEGIN + 1) + ":" + cs.substring(0, cs.length() - 1);
|
||||
for (int i = 0; i < tempW.length; i++) {
|
||||
if (tempW[i] != 0) {
|
||||
System.out.println(name + "\t" + Config.getTagName(i / 4 - 1) + "\t" + Config.getTagName(i % 4) + "\t"
|
||||
+ tempW[i]);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 将model序列化到硬盘
|
||||
*
|
||||
* @param path
|
||||
* @throws IOException
|
||||
* @throws FileNotFoundException
|
||||
*/
|
||||
public void writeModel(String path) {
|
||||
try (FileOutputStream fso = new FileOutputStream(path)) {
|
||||
ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(fso));
|
||||
oos.writeUTF(CRFModel.VERSION);
|
||||
oos.writeObject(status);
|
||||
oos.writeObject(config.getTemplate());
|
||||
Map<String, float[]> map = featureTree.toMap();
|
||||
MapCount<Integer> mc = new MapCount<>();
|
||||
for (float[] v : map.values()) {
|
||||
mc.add(v.length);
|
||||
}
|
||||
for (Entry<Integer, Double> entry : mc.get().entrySet()) {
|
||||
int win = entry.getKey();
|
||||
oos.writeInt(win);// 宽度
|
||||
oos.writeInt(entry.getValue().intValue());// 个数
|
||||
for (Entry<String, float[]> e : map.entrySet()) {
|
||||
if (e.getValue().length == win) {
|
||||
oos.writeUTF(e.getKey());
|
||||
float[] value = e.getValue();
|
||||
for (int i = 0; i < win; i++) {
|
||||
oos.writeFloat(value[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
oos.writeInt(0);
|
||||
oos.writeInt(0);
|
||||
oos.flush();
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.warn("文件没有找到", e);
|
||||
} catch (IOException e) {
|
||||
logger.warn("IO异常", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1,192 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.app.crf;
|
||||
|
||||
import org.ansj.app.crf.pojo.Element;
|
||||
import org.ansj.util.MatrixUtil;
|
||||
import org.nlpcn.commons.lang.util.StringUtil;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 分词
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class SplitWord {
|
||||
|
||||
private Model model = null;
|
||||
|
||||
public SplitWord(Model model) {
|
||||
this.model = model;
|
||||
};
|
||||
|
||||
public List<String> cut(char[] chars) {
|
||||
return cut(new String(chars));
|
||||
}
|
||||
|
||||
public List<String> cut(String line) {
|
||||
|
||||
if (StringUtil.isBlank(line)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Element> elements = vterbi(line);
|
||||
|
||||
List<String> result = new ArrayList<>();
|
||||
|
||||
Element e = null;
|
||||
int begin = 0;
|
||||
int end = 0;
|
||||
int size = elements.size() - 1;
|
||||
for (int i = 0; i < elements.size(); i++) {
|
||||
e = elements.get(i);
|
||||
switch (e.getTag()) {
|
||||
case 0:
|
||||
end += e.len;
|
||||
result.add(line.substring(begin, end));
|
||||
begin = end;
|
||||
break;
|
||||
case 1:
|
||||
end += e.len;
|
||||
while (i < size && (e = elements.get(++i)).getTag() != 3) {
|
||||
end += e.len;
|
||||
}
|
||||
end += e.len;
|
||||
result.add(line.substring(begin, end));
|
||||
begin = end;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private List<Element> vterbi(String line) {
|
||||
List<Element> elements = Config.wordAlert(line);
|
||||
|
||||
int length = elements.size();
|
||||
|
||||
if (length == 0) { // 避免空list,下面get(0)操作越界
|
||||
return elements;
|
||||
}
|
||||
if (length == 1) {
|
||||
elements.get(0).updateTag(0);
|
||||
return elements;
|
||||
}
|
||||
|
||||
/**
|
||||
* 填充图
|
||||
*/
|
||||
for (int i = 0; i < length; i++) {
|
||||
computeTagScore(elements, i);
|
||||
}
|
||||
|
||||
// 如果是开始不可能从 m,e开始 ,所以将它设为一个很小的值
|
||||
elements.get(0).tagScore[2] = -1000;
|
||||
elements.get(0).tagScore[3] = -1000;
|
||||
|
||||
for (int i = 1; i < length; i++) {
|
||||
elements.get(i).maxFrom(model, elements.get(i - 1));
|
||||
}
|
||||
|
||||
// 末位置只能从S,E开始
|
||||
// 末位置只能从0,3开始
|
||||
|
||||
Element next = elements.get(elements.size() - 1);
|
||||
|
||||
Element self = null;
|
||||
|
||||
int maxStatus = next.tagScore[0] > next.tagScore[3] ? 0 : 3;
|
||||
|
||||
next.updateTag(maxStatus);
|
||||
|
||||
maxStatus = next.from[maxStatus];
|
||||
|
||||
// 逆序寻找
|
||||
for (int i = elements.size() - 2; i > 0; i--) {
|
||||
self = elements.get(i);
|
||||
self.updateTag(maxStatus);
|
||||
maxStatus = self.from[self.getTag()];
|
||||
next = self;
|
||||
}
|
||||
elements.get(0).updateTag(maxStatus);
|
||||
|
||||
// printElements(elements) ;
|
||||
|
||||
return elements;
|
||||
|
||||
}
|
||||
|
||||
private void computeTagScore(List<Element> elements, int index) {
|
||||
|
||||
char[][] feautres = model.getConfig().makeFeatureArr(elements, index);
|
||||
|
||||
//TODO: set 20 很大吧!
|
||||
float[] tagScore = new float[20]; //Config.TAG_NUM*Config.TAG_NUM+Config.TAG_NUM
|
||||
|
||||
for (int i = 0; i < feautres.length; i++) {
|
||||
MatrixUtil.dot(tagScore, model.getFeature(feautres[i]));
|
||||
}
|
||||
|
||||
elements.get(index).tagScore = tagScore;
|
||||
}
|
||||
|
||||
/**
|
||||
* 随便给一个词。计算这个词的内聚分值,可以理解为计算这个词的可信度
|
||||
*
|
||||
* @param word
|
||||
*/
|
||||
public float cohesion(String word) {
|
||||
|
||||
if (word.length() == 0) {
|
||||
return Integer.MIN_VALUE;
|
||||
}
|
||||
|
||||
List<Element> elements = Config.wordAlert(word);
|
||||
|
||||
for (int i = 0; i < elements.size(); i++) {
|
||||
computeTagScore(elements, i);
|
||||
}
|
||||
|
||||
float value = elements.get(0).tagScore[1];
|
||||
|
||||
int len = elements.size() - 1;
|
||||
|
||||
for (int i = 1; i < len; i++) {
|
||||
value += elements.get(i).tagScore[2];
|
||||
}
|
||||
|
||||
value += elements.get(len).tagScore[3];
|
||||
|
||||
if (value < 0) {
|
||||
return 1;
|
||||
} else {
|
||||
value += 1;
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,92 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.app.crf.model;
|
||||
|
||||
import org.ansj.app.crf.Config;
|
||||
import org.ansj.app.crf.Model;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
import org.nlpcn.commons.lang.util.IOUtil;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.ZipException;
|
||||
|
||||
public class CRFModel extends Model {
|
||||
|
||||
public static final String VERSION = "ansj1";
|
||||
|
||||
@Override
|
||||
public CRFModel loadModel(String modelPath) throws Exception {
|
||||
try (InputStream is = IOUtil.getInputStream(modelPath)) {
|
||||
loadModel(is);
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public CRFModel loadModel(InputStream is) throws Exception {
|
||||
long start = System.currentTimeMillis();
|
||||
try (ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(is))) {
|
||||
ois.readUTF();
|
||||
this.status = (float[][]) ois.readObject();
|
||||
int[][] template = (int[][]) ois.readObject();
|
||||
this.config = new Config(template);
|
||||
int win = 0;
|
||||
int size = 0;
|
||||
String name = null;
|
||||
featureTree = new SmartForest<float[]>();
|
||||
float[] value = null;
|
||||
do {
|
||||
win = ois.readInt();
|
||||
size = ois.readInt();
|
||||
for (int i = 0; i < size; i++) {
|
||||
name = ois.readUTF();
|
||||
value = new float[win];
|
||||
for (int j = 0; j < value.length; j++) {
|
||||
value[j] = ois.readFloat();
|
||||
}
|
||||
featureTree.add(name, value);
|
||||
}
|
||||
} while (win == 0 || size == 0);
|
||||
logger.info("load crf model ok ! use time :" + (System.currentTimeMillis() - start));
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean checkModel(String modelPath) {
|
||||
try (FileInputStream fis = new FileInputStream(modelPath)) {
|
||||
ObjectInputStream inputStream = new ObjectInputStream(new GZIPInputStream(fis));
|
||||
String version = inputStream.readUTF();
|
||||
if (version.equals("ansj1")) { // 加载ansj,model
|
||||
return true;
|
||||
}
|
||||
} catch (ZipException ze) {
|
||||
logger.warn("解压异常", ze);
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.warn("文件没有找到", e);
|
||||
} catch (IOException e) {
|
||||
logger.warn("IO异常", e);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,332 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.app.crf.model;
|
||||
|
||||
import org.ansj.app.crf.Config;
|
||||
import org.ansj.app.crf.Model;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
import org.nlpcn.commons.lang.util.IOUtil;
|
||||
import org.nlpcn.commons.lang.util.ObjConver;
|
||||
import org.nlpcn.commons.lang.util.StringUtil;
|
||||
import org.nlpcn.commons.lang.util.tuples.Pair;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.*;
|
||||
|
||||
public class CRFppTxtModel extends Model {
|
||||
|
||||
/**
|
||||
* 解析crf++生成的可可视txt文件
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public CRFppTxtModel loadModel(String modelPath) throws Exception {
|
||||
try (InputStream is = new FileInputStream(modelPath)) {
|
||||
loadModel(new FileInputStream(modelPath));
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Model loadModel(InputStream is) throws Exception {
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
BufferedReader reader = IOUtil.getReader(is, IOUtil.UTF8);
|
||||
|
||||
reader.readLine();// version
|
||||
reader.readLine();// cost-factor
|
||||
|
||||
// int maxId =
|
||||
// Integer.parseInt(reader.readLine().split(":")[1].trim());// read
|
||||
reader.readLine();// xsize
|
||||
reader.readLine(); // line
|
||||
int[] statusCoven = loadTagCoven(reader);
|
||||
Map<String, Integer> featureIndex = loadConfig(reader);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int[] t1 : config.getTemplate()) {
|
||||
sb.append(Arrays.toString(t1) + " ");
|
||||
}
|
||||
logger.info("load template ok template : " + sb);
|
||||
TreeMap<Integer, Pair<String, String>> featureNames = loadFeatureName(featureIndex, reader);
|
||||
logger.info("load feature ok feature size : " + featureNames.size());
|
||||
loadFeatureWeight(reader, statusCoven, featureNames);
|
||||
logger.info("load crfpp model ok ! use time : " + (System.currentTimeMillis() - start));
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 加载特征值 //11:*6:_x-1/的,
|
||||
*
|
||||
* @param maxId
|
||||
*
|
||||
* @param featureIndex
|
||||
*
|
||||
* @param br
|
||||
* @return
|
||||
* @throws Exception
|
||||
*/
|
||||
|
||||
private TreeMap<Integer, Pair<String, String>> loadFeatureName(Map<String, Integer> featureIndex, BufferedReader br)
|
||||
throws Exception {
|
||||
|
||||
TreeMap<Integer, Pair<String, String>> featureNames = new TreeMap<>();
|
||||
|
||||
String temp = null;
|
||||
while (StringUtil.isNotBlank(temp = br.readLine())) {
|
||||
|
||||
int indexOf = temp.indexOf(" ");
|
||||
|
||||
int id = ObjConver.getIntValue(temp.substring(0, indexOf));
|
||||
|
||||
if (indexOf > 0) {
|
||||
temp = temp.substring(indexOf);
|
||||
}
|
||||
|
||||
String[] split = temp.split(":");
|
||||
|
||||
if (split.length == 1) {
|
||||
featureNames.put(id, Pair.with(temp.trim(), ""));
|
||||
} else {
|
||||
String name = split[1];
|
||||
if (split.length > 2) {
|
||||
for (int j = 2; j < split.length; j++) {
|
||||
name += ":" + split[j];
|
||||
}
|
||||
}
|
||||
|
||||
int lastFeatureId = featureIndex.get(split[0].trim());
|
||||
|
||||
if ("/".equals(name)) {
|
||||
name = "//";
|
||||
}
|
||||
|
||||
if (name.contains("//")) {
|
||||
name = name.replaceAll("//", "/XIEGANG/");
|
||||
}
|
||||
String featureName = toFeatureName(name.trim().split("/"), lastFeatureId);
|
||||
|
||||
featureNames.put(id, Pair.with(split[0].trim(), featureName));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return featureNames;
|
||||
|
||||
}
|
||||
|
||||
private String toFeatureName(String[] split, int lastFeatureId) throws Exception {
|
||||
|
||||
StringBuilder result = new StringBuilder();
|
||||
|
||||
for (String str : split) {
|
||||
if ("".equals(str)) {
|
||||
continue;
|
||||
} else if (str.length() == 1) {
|
||||
result.append(str.charAt(0));
|
||||
} else if (str.equals("XIEGANG")) {
|
||||
result.append('/');
|
||||
} else if (str.startsWith("num")) {
|
||||
result.append((char) (Config.NUM_BEGIN + ObjConver.getIntValue(str.replace("num", ""))));
|
||||
} else if (str.startsWith("en")) {
|
||||
result.append((char) (Config.EN_BEGIN + ObjConver.getIntValue(str.replace("en", ""))));
|
||||
} else if (str.startsWith("_B-")) {
|
||||
result.append(Config.BEGIN);
|
||||
} else if (str.startsWith("_B+")) {
|
||||
result.append(Config.END);
|
||||
} else {
|
||||
throw new Exception("can find feature named " + str + " in " + Arrays.toString(split));
|
||||
}
|
||||
}
|
||||
|
||||
result.append((char) (lastFeatureId + Config.FEATURE_BEGIN));
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* 加载特征权重
|
||||
*
|
||||
* @param br
|
||||
* @param featureNames
|
||||
* @param statusCoven
|
||||
* @throws Exception
|
||||
*/
|
||||
private void loadFeatureWeight(BufferedReader br, int[] statusCoven,
|
||||
TreeMap<Integer, Pair<String, String>> featureNames) throws Exception {
|
||||
|
||||
featureTree = new SmartForest<float[]>();
|
||||
|
||||
int tag = 0; // 赏析按标签为用来转换
|
||||
|
||||
int len = 0; // 权重数组的大小
|
||||
|
||||
String name = null; // 特征名称
|
||||
|
||||
float[] tempW = null; // 每一个特征的权重
|
||||
|
||||
String temp = null;
|
||||
|
||||
for (Pair<String, String> pair : featureNames.values()) {
|
||||
|
||||
char fc = Character.toUpperCase(pair.getValue0().charAt(0));
|
||||
|
||||
len = fc == 'B' ? Config.TAG_NUM * Config.TAG_NUM
|
||||
: fc == 'U' ? Config.TAG_NUM
|
||||
: fc == '*' ? (Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM) : 0;
|
||||
|
||||
if (len == 0) {
|
||||
throw new Exception("unknow feature type " + pair.getValue0());
|
||||
}
|
||||
|
||||
if (fc == 'B') { // 特殊处理转换特征数组
|
||||
for (int i = 0; i < len; i++) {
|
||||
temp = br.readLine();
|
||||
int from = statusCoven[i / Config.TAG_NUM];
|
||||
int to = statusCoven[i % Config.TAG_NUM];
|
||||
status[from][to] = ObjConver.getFloatValue(temp);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
name = pair.getValue1();
|
||||
|
||||
tempW = new float[len];
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
temp = br.readLine();
|
||||
tag = statusCoven[i];
|
||||
tempW[tag] = ObjConver.getFloatValue(temp);
|
||||
}
|
||||
this.featureTree.add(name, tempW); // 将特征增加到特征🌲中
|
||||
|
||||
// printFeatureTree(name, tempW);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 加载特征标签转换
|
||||
*
|
||||
* @param br
|
||||
* @return
|
||||
* @throws Exception
|
||||
*/
|
||||
private int[] loadTagCoven(BufferedReader br) throws Exception {
|
||||
|
||||
int[] conver = new int[Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM];
|
||||
|
||||
String temp = null;
|
||||
|
||||
// TODO: 这个是个写死的过程,如果标签发生改变需要重新来写这里
|
||||
for (int i = 0; i < Config.TAG_NUM; i++) {
|
||||
String line = br.readLine();
|
||||
if (StringUtil.isBlank(line)) {
|
||||
i--;
|
||||
continue;
|
||||
}
|
||||
|
||||
char c = line.charAt(0);
|
||||
switch (c) {
|
||||
case 'S':
|
||||
conver[i] = Config.S;
|
||||
break;
|
||||
case 'B':
|
||||
conver[i] = Config.B;
|
||||
break;
|
||||
case 'M':
|
||||
conver[i] = Config.M;
|
||||
break;
|
||||
case 'E':
|
||||
conver[i] = Config.E;
|
||||
break;
|
||||
default:
|
||||
throw new Exception("err tag named " + c + " in model " + temp);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = Config.TAG_NUM; i < conver.length; i++) {
|
||||
conver[i] = conver[(i - 4) / Config.TAG_NUM] * Config.TAG_NUM + conver[i % Config.TAG_NUM] + Config.TAG_NUM;
|
||||
}
|
||||
|
||||
return conver;
|
||||
}
|
||||
|
||||
private Map<String, Integer> loadConfig(BufferedReader br) throws IOException {
|
||||
|
||||
Map<String, Integer> featureIndex = new HashMap<>();
|
||||
|
||||
String temp = br.readLine();// #rdr#8/0/0
|
||||
|
||||
List<int[]> list = new ArrayList<>();
|
||||
|
||||
while (StringUtil.isNotBlank((temp = br.readLine()))) {
|
||||
|
||||
List<String> matcherAll = StringUtil.matcherAll("\\[.*?\\]", temp);
|
||||
|
||||
if (matcherAll.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int[] is = new int[matcherAll.size()];
|
||||
for (int j = 0; j < is.length; j++) {
|
||||
is[j] = ObjConver.getIntValue(StringUtil.matcherFirst("[-\\d]+", matcherAll.get(j)));
|
||||
}
|
||||
|
||||
featureIndex.put(temp.split(":")[0].trim(), list.size());
|
||||
|
||||
list.add(is);
|
||||
}
|
||||
|
||||
int[][] template = new int[list.size()][0]; // 构建特征模板
|
||||
|
||||
for (int i = 0; i < template.length; i++) {
|
||||
template[i] = list.get(i);
|
||||
}
|
||||
|
||||
config = new Config(template);
|
||||
|
||||
return featureIndex;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean checkModel(String modelPath) {
|
||||
|
||||
try (InputStream is = IOUtil.getInputStream(modelPath)) {
|
||||
byte[] bytes = new byte[100];
|
||||
is.read(bytes);
|
||||
String string = new String(bytes);
|
||||
if (string.startsWith("version")) { // 加载crf++ 的txt类型的modle
|
||||
return true;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.warn("IO异常", e);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,360 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.app.crf.model;
|
||||
|
||||
import org.ansj.app.crf.Config;
|
||||
import org.ansj.app.crf.Model;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
import org.nlpcn.commons.lang.util.IOUtil;
|
||||
import org.nlpcn.commons.lang.util.ObjConver;
|
||||
import org.nlpcn.commons.lang.util.StringUtil;
|
||||
import org.nlpcn.commons.lang.util.tuples.Pair;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.*;
|
||||
|
||||
public class WapitiCRFModel extends Model {
|
||||
|
||||
@Override
|
||||
public WapitiCRFModel loadModel(String modelPath) throws Exception {
|
||||
try (InputStream is = IOUtil.getInputStream(modelPath)) {
|
||||
return loadModel(is);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public WapitiCRFModel loadModel(InputStream is) throws Exception {
|
||||
BufferedReader br = IOUtil.getReader(is, IOUtil.UTF8);
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
logger.info("load wapiti model begin!");
|
||||
|
||||
String temp = br.readLine();
|
||||
|
||||
logger.info(temp); // #mdl#2#123
|
||||
|
||||
Map<String, Integer> featureIndex = loadConfig(br);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int[] t1 : config.getTemplate()) {
|
||||
sb.append(Arrays.toString(t1) + " ");
|
||||
}
|
||||
|
||||
logger.info("featureIndex is " + featureIndex);
|
||||
logger.info("load template ok template : " + sb);
|
||||
|
||||
int[] statusCoven = loadTagCoven(br);
|
||||
|
||||
List<Pair<String, String>> loadFeatureName = loadFeatureName(featureIndex, br);
|
||||
|
||||
logger.info("load feature ok feature size : " + loadFeatureName.size());
|
||||
|
||||
featureTree = new SmartForest<float[]>();
|
||||
|
||||
loadFeatureWeight(br, statusCoven, loadFeatureName);
|
||||
|
||||
logger.info("load wapiti model ok ! use time :" + (System.currentTimeMillis() - start));
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 加载特征权重
|
||||
*
|
||||
* @param br
|
||||
* @param featureNames
|
||||
* @param statusCoven
|
||||
* @throws Exception
|
||||
*/
|
||||
private void loadFeatureWeight(BufferedReader br, int[] statusCoven, List<Pair<String, String>> featureNames)
|
||||
throws Exception {
|
||||
|
||||
int key = 0;
|
||||
|
||||
int offe = 0;
|
||||
|
||||
int tag = 0; // 赏析按标签为用来转换
|
||||
|
||||
int len = 0; // 权重数组的大小
|
||||
|
||||
int min, max = 0; // 设置边界
|
||||
|
||||
String name = null; // 特征名称
|
||||
|
||||
float[] tempW = null; // 每一个特征的权重
|
||||
|
||||
String temp = br.readLine();
|
||||
|
||||
for (Pair<String, String> pair : featureNames) {
|
||||
|
||||
if (temp == null) {
|
||||
logger.warn(pair.getValue0() + "\t" + pair.getValue1() + " not have any weight ,so skip it !");
|
||||
continue;
|
||||
}
|
||||
|
||||
char fc = Character.toUpperCase(pair.getValue0().charAt(0));
|
||||
|
||||
len = fc == 'B' ? Config.TAG_NUM * Config.TAG_NUM
|
||||
: fc == 'U' ? Config.TAG_NUM
|
||||
: fc == '*' ? (Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM) : 0;
|
||||
|
||||
if (len == 0) {
|
||||
throw new Exception("unknow feature type " + pair.getValue0());
|
||||
}
|
||||
|
||||
min = max;
|
||||
max += len;
|
||||
if (fc == 'B') { // 特殊处理转换特征数组
|
||||
for (int i = 0; i < len; i++) {
|
||||
String[] split = temp.split("=");
|
||||
int from = statusCoven[i / Config.TAG_NUM];
|
||||
int to = statusCoven[i % Config.TAG_NUM];
|
||||
status[from][to] = ObjConver.getFloatValue(split[1]);
|
||||
temp = br.readLine();
|
||||
}
|
||||
} else {
|
||||
|
||||
name = pair.getValue1();
|
||||
|
||||
tempW = new float[len];
|
||||
|
||||
do {
|
||||
String[] split = temp.split("=");
|
||||
|
||||
key = ObjConver.getIntValue(split[0]);
|
||||
|
||||
if (key >= max) { // 如果超过边界那么跳出
|
||||
break;
|
||||
}
|
||||
|
||||
offe = key - min;
|
||||
|
||||
tag = statusCoven[offe];
|
||||
|
||||
tempW[tag] = ObjConver.getFloatValue(split[1]);
|
||||
|
||||
} while ((temp = br.readLine()) != null);
|
||||
|
||||
this.featureTree.add(name, tempW); // 将特征增加到特征🌲中
|
||||
|
||||
// printFeatureTree(name, tempW);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 加载特征值 //11:*6:_x-1/的,
|
||||
*
|
||||
* @param featureIndex
|
||||
*
|
||||
* @param br
|
||||
* @return
|
||||
* @throws Exception
|
||||
*/
|
||||
|
||||
private List<Pair<String, String>> loadFeatureName(Map<String, Integer> featureIndex, BufferedReader br)
|
||||
throws Exception {
|
||||
String temp = br.readLine();// #qrk#num
|
||||
int featureNum = ObjConver.getIntValue(StringUtil.matcherFirst("\\d+", temp)); // 找到特征个数
|
||||
|
||||
List<Pair<String, String>> featureNames = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < featureNum; i++) {
|
||||
temp = br.readLine();
|
||||
|
||||
String[] split = temp.split(":");
|
||||
|
||||
if (split.length == 2) {
|
||||
featureNames.add(Pair.with(split[1], ""));
|
||||
continue;
|
||||
} else {
|
||||
|
||||
String name = split[2];
|
||||
|
||||
if (split.length > 3) {
|
||||
for (int j = 3; j < split.length; j++) {
|
||||
name += ":" + split[j];
|
||||
}
|
||||
}
|
||||
|
||||
// 去掉最后的空格
|
||||
name = name.substring(0, name.length() - 1);
|
||||
|
||||
int lastFeatureId = featureIndex.get(split[1]);
|
||||
|
||||
if ("/".equals(name)) {
|
||||
name = "//";
|
||||
}
|
||||
|
||||
if (name.contains("//")) {
|
||||
name = name.replaceAll("//", "/XIEGANG/");
|
||||
}
|
||||
String featureName = toFeatureName(name.trim().split("/"), lastFeatureId);
|
||||
|
||||
featureNames.add(Pair.with(split[1], featureName));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return featureNames;
|
||||
|
||||
}
|
||||
|
||||
private String toFeatureName(String[] split, int lastFeatureId) throws Exception {
|
||||
|
||||
StringBuilder result = new StringBuilder();
|
||||
|
||||
for (String str : split) {
|
||||
if ("".equals(str)) {
|
||||
continue;
|
||||
} else if (str.length() == 1) {
|
||||
result.append(str.charAt(0));
|
||||
} else if (str.equals("XIEGANG")) {
|
||||
result.append('/');
|
||||
} else if (str.startsWith("num")) {
|
||||
result.append((char) (Config.NUM_BEGIN + ObjConver.getIntValue(str.replace("num", ""))));
|
||||
} else if (str.startsWith("en")) {
|
||||
result.append((char) (Config.EN_BEGIN + ObjConver.getIntValue(str.replace("en", ""))));
|
||||
} else if (str.startsWith("_x-")) {
|
||||
result.append(Config.BEGIN);
|
||||
} else if (str.startsWith("_x+")) {
|
||||
result.append(Config.END);
|
||||
} else {
|
||||
throw new Exception("can find feature named " + str + " in " + Arrays.toString(split));
|
||||
}
|
||||
}
|
||||
|
||||
result.append((char) (lastFeatureId + Config.FEATURE_BEGIN));
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* 加载特征标签转换
|
||||
*
|
||||
* @param br
|
||||
* @return
|
||||
* @throws Exception
|
||||
*/
|
||||
private int[] loadTagCoven(BufferedReader br) throws Exception {
|
||||
|
||||
int[] conver = new int[Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM];
|
||||
|
||||
String temp = br.readLine();// #qrk#4
|
||||
|
||||
// TODO: 这个是个写死的过程,如果标签发生改变需要重新来写这里
|
||||
for (int i = 0; i < Config.TAG_NUM; i++) {
|
||||
char c = br.readLine().split(":")[1].charAt(0);
|
||||
switch (c) {
|
||||
case 'S':
|
||||
conver[i] = Config.S;
|
||||
break;
|
||||
case 'B':
|
||||
conver[i] = Config.B;
|
||||
break;
|
||||
case 'M':
|
||||
conver[i] = Config.M;
|
||||
break;
|
||||
case 'E':
|
||||
conver[i] = Config.E;
|
||||
break;
|
||||
default:
|
||||
throw new Exception("err tag named " + c + " in model " + temp);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = Config.TAG_NUM; i < conver.length; i++) {
|
||||
conver[i] = conver[(i - 4) / Config.TAG_NUM] * Config.TAG_NUM + conver[i % Config.TAG_NUM] + Config.TAG_NUM;
|
||||
}
|
||||
|
||||
return conver;
|
||||
}
|
||||
|
||||
/**
|
||||
* 加载特征模板
|
||||
*
|
||||
* @param br
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
private Map<String, Integer> loadConfig(BufferedReader br) throws IOException {
|
||||
|
||||
Map<String, Integer> featureIndex = new HashMap<>();
|
||||
|
||||
String temp = br.readLine();// #rdr#8/0/0
|
||||
|
||||
int featureNum = ObjConver.getIntValue(StringUtil.matcherFirst("\\d+", temp)); // 找到特征个数
|
||||
|
||||
List<int[]> list = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < featureNum; i++) {
|
||||
temp = br.readLine();
|
||||
|
||||
List<String> matcherAll = StringUtil.matcherAll("\\[.*?\\]", temp);
|
||||
|
||||
if (matcherAll.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int[] is = new int[matcherAll.size()];
|
||||
for (int j = 0; j < is.length; j++) {
|
||||
is[j] = ObjConver.getIntValue(StringUtil.matcherFirst("[-\\d]+", matcherAll.get(j)));
|
||||
}
|
||||
|
||||
featureIndex.put(temp.split(":")[1], list.size());
|
||||
|
||||
list.add(is);
|
||||
}
|
||||
|
||||
int[][] template = new int[list.size()][0]; // 构建特征模板
|
||||
|
||||
for (int i = 0; i < template.length; i++) {
|
||||
template[i] = list.get(i);
|
||||
}
|
||||
|
||||
config = new Config(template);
|
||||
|
||||
return featureIndex;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean checkModel(String modelPath) {
|
||||
|
||||
try (InputStream is = IOUtil.getInputStream(modelPath)) {
|
||||
byte[] bytes = new byte[100];
|
||||
|
||||
is.read(bytes);
|
||||
|
||||
String string = new String(bytes);
|
||||
if (string.startsWith("#mdl#")) { // 加载crf++ 的txt类型的modle
|
||||
return true;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.warn("IO异常", e);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,110 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.app.crf.pojo;
|
||||
|
||||
import org.ansj.app.crf.Config;
|
||||
import org.ansj.app.crf.Model;
|
||||
|
||||
public class Element {
|
||||
|
||||
public char name;
|
||||
private int tag = -1;
|
||||
public int len = 1;
|
||||
public String nature;
|
||||
|
||||
public float[] tagScore;
|
||||
|
||||
public int[] from;
|
||||
|
||||
public Element(char name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public Element(Character name, int tag) {
|
||||
this.name = name;
|
||||
this.tag = tag;
|
||||
}
|
||||
|
||||
public int getTag() {
|
||||
return tag;
|
||||
}
|
||||
|
||||
public Element updateTag(int tag) {
|
||||
this.tag = tag;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Element updateNature(String nature) {
|
||||
this.nature = nature;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return name + "/" + len + "/" + tag;
|
||||
}
|
||||
|
||||
public char getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获得可见的名称
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public String nameStr() {
|
||||
if (name >= 130 && name < 140) {
|
||||
return ("num" + (name - 130));
|
||||
} else if (name >= 140 && name < 150) {
|
||||
return ("en" + (name - 140));
|
||||
} else {
|
||||
return String.valueOf(name);
|
||||
}
|
||||
}
|
||||
|
||||
public void maxFrom(Model model, Element element) {
|
||||
if (from == null) {
|
||||
from = new int[Config.TAG_NUM];
|
||||
}
|
||||
float[] pTagScore = element.tagScore;
|
||||
for (int i = 0; i < Config.TAG_NUM; i++) {
|
||||
float maxValue = 0;
|
||||
for (int j = 0; j < Config.TAG_NUM; j++) {
|
||||
|
||||
float value = (pTagScore[j] + tagScore[i]) + model.tagRate(j, i);
|
||||
|
||||
if (tagScore.length > Config.TAG_NUM) {
|
||||
value += tagScore[Config.TAG_NUM + j * Config.TAG_NUM + i];
|
||||
}
|
||||
|
||||
if (value > maxValue) {
|
||||
maxValue = value;
|
||||
from[i] = j;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
tagScore[i] = maxValue;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,163 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.app.keyword;

import org.ansj.domain.Term;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.nlpcn.commons.lang.util.StringUtil;

import java.util.*;

public class KeyWordComputer<T extends Analysis> {

    private static final Map<String, Double> POS_SCORE = new HashMap<>();
    private T analysisType;

    static {
        POS_SCORE.put("null", 0.0);
        POS_SCORE.put("w", 0.0);
        POS_SCORE.put("en", 0.0);
        POS_SCORE.put("m", 0.0);
        POS_SCORE.put("num", 0.0);
        POS_SCORE.put("nr", 3.0);
        POS_SCORE.put("nrf", 3.0);
        POS_SCORE.put("nw", 3.0);
        POS_SCORE.put("nt", 3.0);
        POS_SCORE.put("l", 0.2);
        POS_SCORE.put("a", 0.2);
        POS_SCORE.put("nz", 3.0);
        POS_SCORE.put("v", 0.2);
        POS_SCORE.put("kw", 6.0); // keyword part of speech
    }

    private int nKeyword = 5;

    public KeyWordComputer() {}

    public void setAnalysisType(T analysisType) {
        this.analysisType = analysisType;
    }

    /**
     * Number of keywords to return
     *
     * @param nKeyword
     */
    public KeyWordComputer(int nKeyword) {
        this.nKeyword = nKeyword;
        this.analysisType = (T) new NlpAnalysis(); // uses the NLP tokenizer by default
    }

    public KeyWordComputer(int nKeyword, T analysisType) {
        this.nKeyword = nKeyword;
        this.analysisType = analysisType;
    }

    /**
     * @param content the body text
     * @return
     */
    private List<Keyword> computeArticleTfidf(String content, int titleLength) {
        Map<String, Keyword> tm = new HashMap<>();

        List<Term> parse = analysisType.parseStr(content).getTerms();
        // FIXME: this relies on the POS tags of the user-defined dictionary, so another method is needed..
        // parse = FilterModifWord.updateNature(parse) ;

        for (Term term : parse) {
            double weight = getWeight(term, content.length(), titleLength);
            if (weight == 0)
                continue;

            Keyword keyword = tm.get(term.getName());

            if (keyword == null) {
                keyword = new Keyword(term.getName(), term.natrue().allFrequency, weight);
                tm.put(term.getName(), keyword);
            } else {
                keyword.updateWeight(1);
            }
        }

        TreeSet<Keyword> treeSet = new TreeSet<>(tm.values());

        ArrayList<Keyword> arrayList = new ArrayList<>(treeSet);
        if (treeSet.size() <= nKeyword) {
            return arrayList;
        } else {
            return arrayList.subList(0, nKeyword);
        }
    }

    /**
     * @param title   the title
     * @param content the body text
     * @return
     */
    public List<Keyword> computeArticleTfidf(String title, String content) {
        if (StringUtil.isBlank(title)) {
            title = "";
        }
        if (StringUtil.isBlank(content)) {
            content = "";
        }
        return computeArticleTfidf(title + "\t" + content, title.length());
    }

    /**
     * Body text only
     *
     * @param content
     * @return
     */
    public List<Keyword> computeArticleTfidf(String content) {
        return computeArticleTfidf(content, 0);
    }

    private double getWeight(Term term, int length, int titleLength) {
        if (term.getName().trim().length() < 2) {
            return 0;
        }

        String pos = term.natrue().natureStr;

        Double posScore = POS_SCORE.get(pos);

        if (posScore == null) {
            posScore = 1.0;
        } else if (posScore == 0) {
            return 0;
        }

        if (titleLength > term.getOffe()) {
            return 5 * posScore;
        }
        return (length - term.getOffe()) * posScore / length;
    }

}

@@ -1,94 +0,0 @@
package org.ansj.app.keyword;

public class Keyword implements Comparable<Keyword> {
    private String name;
    private double score;
    private double idf;
    private int freq;

    public Keyword(String name, int docFreq, double weight) {
        this.name = name;
        this.idf = Math.log(1 + 10000.0 / (docFreq + 1));
        this.score = idf * weight;
        freq++;
    }

    public Keyword(String name, double score) {
        this.name = name;
        this.score = score;
        this.idf = score;
        freq++;
    }

    public void updateWeight(int weight) {
        this.score += weight * idf;
        freq++;
    }

    public int getFreq() {
        return freq;
    }

    @Override
    public int compareTo(Keyword o) {
        if (this.score < o.score) {
            return 1;
        } else {
            return -1;
        }
    }

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof Keyword) {
            Keyword k = (Keyword) obj;
            return k.name.equals(name);
        } else {
            return false;
        }
    }

    @Override
    public String toString() {
        return name + "/" + score; // "="+score+":"+freq+":"+idf;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public double getScore() {
        return score;
    }

    public void setScore(double score) {
        this.score = score;
    }
}

@@ -1,25 +0,0 @@
/**
 * @author 崇伟峰
 *
 */
package org.ansj.app.keyword;

@@ -1,332 +0,0 @@
package org.ansj.app.summary;
|
||||
|
||||
import org.ansj.app.keyword.KeyWordComputer;
|
||||
import org.ansj.app.keyword.Keyword;
|
||||
import org.ansj.app.summary.pojo.Summary;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.splitWord.analysis.NlpAnalysis;
|
||||
import org.nlpcn.commons.lang.tire.SmartGetWord;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
import org.nlpcn.commons.lang.util.MapCount;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* 自动摘要,同时返回关键词
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class SummaryComputer {
|
||||
|
||||
private static final Set<String> FILTER_SET = new HashSet<>();
|
||||
|
||||
static {
|
||||
FILTER_SET.add("w");
|
||||
FILTER_SET.add("null");
|
||||
}
|
||||
|
||||
/**
|
||||
* summaryLength
|
||||
*/
|
||||
private int len = 300;
|
||||
|
||||
private boolean isSplitSummary = true;
|
||||
|
||||
String title, content;
|
||||
|
||||
public SummaryComputer(String title, String content) {
|
||||
this.title = title;
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
public SummaryComputer(int len, String title, String content) {
|
||||
this.len = len;
|
||||
this.title = title;
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
public SummaryComputer(int len, boolean isSplitSummary, String title, String content) {
|
||||
this.len = len;
|
||||
this.title = title;
|
||||
this.content = content;
|
||||
this.isSplitSummary = isSplitSummary;
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算摘要,利用关键词抽取计算
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Summary toSummary() {
|
||||
return toSummary(new ArrayList<Keyword>());
|
||||
}
|
||||
|
||||
/**
|
||||
* 根据用户查询串计算摘要
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Summary toSummary(String query) {
|
||||
|
||||
List<Term> parse = NlpAnalysis.parse(query).getTerms();
|
||||
|
||||
List<Keyword> keywords = new ArrayList<>();
|
||||
for (Term term : parse) {
|
||||
if (FILTER_SET.contains(term.natrue().natureStr)) {
|
||||
continue;
|
||||
}
|
||||
keywords.add(new Keyword(term.getName(), term.termNatures().allFreq, 1));
|
||||
}
|
||||
|
||||
return toSummary(keywords);
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算摘要,传入用户自己算好的关键词
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Summary toSummary(List<Keyword> keywords) {
|
||||
|
||||
if (keywords == null) {
|
||||
keywords = new ArrayList<>();
|
||||
}
|
||||
|
||||
if (keywords.isEmpty()) {
|
||||
|
||||
KeyWordComputer kc = new KeyWordComputer(10);
|
||||
keywords = kc.computeArticleTfidf(title, content);
|
||||
}
|
||||
return explan(keywords, content);
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算摘要
|
||||
*
|
||||
* @param keyword
|
||||
* @param content
|
||||
* @return
|
||||
*/
|
||||
private Summary explan(List<Keyword> keywords, String content) {
|
||||
|
||||
SmartForest<Double> sf = new SmartForest<>();
|
||||
|
||||
for (Keyword keyword : keywords) {
|
||||
sf.add(keyword.getName(), keyword.getScore());
|
||||
}
|
||||
|
||||
// 先断句
|
||||
List<Sentence> sentences = toSentenceList(content.toCharArray());
|
||||
|
||||
for (Sentence sentence : sentences) {
|
||||
computeScore(sentence, sf);
|
||||
}
|
||||
|
||||
double maxScore = 0;
|
||||
int maxIndex = 0;
|
||||
|
||||
MapCount<String> mc = new MapCount<>();
|
||||
|
||||
for (int i = 0; i < sentences.size(); i++) {
|
||||
double tempScore = sentences.get(i).score;
|
||||
int tempLength = sentences.get(i).value.length();
|
||||
mc.addAll(sentences.get(i).mc.get());
|
||||
|
||||
if (tempLength >= len) {
|
||||
tempScore = tempScore * mc.get().size();
|
||||
if (maxScore < tempScore) {
|
||||
maxScore = tempScore;
|
||||
maxIndex = i;
|
||||
continue;
|
||||
}
|
||||
mc.get().clear();
|
||||
}
|
||||
for (int j = i + 1; j < sentences.size(); j++) {
|
||||
tempScore += sentences.get(j).score;
|
||||
tempLength += sentences.get(j).value.length();
|
||||
mc.addAll(sentences.get(j).mc.get());
|
||||
|
||||
if (tempLength >= len) {
|
||||
tempScore = tempScore * mc.get().size();
|
||||
if (maxScore < tempScore) {
|
||||
maxScore = tempScore;
|
||||
maxIndex = i;
|
||||
}
|
||||
mc.get().clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (tempLength < len) {
|
||||
tempScore = tempScore * mc.get().size();
|
||||
if (maxScore < tempScore) {
|
||||
maxScore = tempScore;
|
||||
maxIndex = i;
|
||||
break;
|
||||
}
|
||||
mc.get().clear();
|
||||
}
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = maxIndex; i < sentences.size(); i++) {
|
||||
sb.append(sentences.get(i).value);
|
||||
if (sb.length() > len) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
String summaryStr = sb.toString();
|
||||
|
||||
/**
|
||||
* 是否强制文本长度。对于abc这种字符算半个长度
|
||||
*/
|
||||
|
||||
if (isSplitSummary && sb.length() > len) {
|
||||
double value = len;
|
||||
|
||||
StringBuilder newSummary = new StringBuilder();
|
||||
char c = 0;
|
||||
for (int i = 0; i < sb.length(); i++) {
|
||||
c = sb.charAt(i);
|
||||
if (c < 256) {
|
||||
value -= 0.5;
|
||||
} else {
|
||||
value -= 1;
|
||||
}
|
||||
|
||||
if (value < 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
newSummary.append(c);
|
||||
}
|
||||
|
||||
summaryStr = newSummary.toString();
|
||||
}
|
||||
|
||||
return new Summary(keywords, summaryStr);
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算一个句子的分数
|
||||
*
|
||||
* @param sentence
|
||||
* @param sf
|
||||
*/
|
||||
private void computeScore(Sentence sentence, SmartForest<Double> forest) {
|
||||
SmartGetWord<Double> sgw = new SmartGetWord<>(forest, sentence.value);
|
||||
String name = null;
|
||||
while ((name = sgw.getFrontWords()) != null) {
|
||||
sentence.updateScore(name, sgw.getParam());
|
||||
}
|
||||
if (sentence.score == 0) {
|
||||
sentence.score = sentence.value.length() * -0.005;
|
||||
} else {
|
||||
sentence.score /= Math.log(sentence.value.length() + 3);
|
||||
}
|
||||
}
|
||||
|
||||
public List<Sentence> toSentenceList(char[] chars) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
List<Sentence> sentences = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < chars.length; i++) {
|
||||
if (sb.length() == 0 && (Character.isWhitespace(chars[i]) || chars[i] == ' ')) {
|
||||
continue;
|
||||
}
|
||||
|
||||
sb.append(chars[i]);
|
||||
switch (chars[i]) {
|
||||
case '.':
|
||||
if (i < chars.length - 1 && chars[i + 1] > 128) {
|
||||
insertIntoList(sb, sentences);
|
||||
sb = new StringBuilder();
|
||||
}
|
||||
break;
|
||||
//case ' ':
|
||||
case ' ':
|
||||
case ' ':
|
||||
case ' ':
|
||||
case ',':
|
||||
case '。':
|
||||
case ';':
|
||||
case ';':
|
||||
case '!':
|
||||
case '!':
|
||||
case ',':
|
||||
case '?':
|
||||
case '?':
|
||||
case '\n':
|
||||
case '\r':
|
||||
insertIntoList(sb, sentences);
|
||||
sb = new StringBuilder();
|
||||
}
|
||||
}
|
||||
|
||||
if (sb.length() > 0) {
|
||||
insertIntoList(sb, sentences);
|
||||
}
|
||||
|
||||
return sentences;
|
||||
}
|
||||
|
||||
private void insertIntoList(StringBuilder sb, List<Sentence> sentences) {
|
||||
String content = sb.toString().trim();
|
||||
if (content.length() > 0) {
|
||||
sentences.add(new Sentence(content));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* 句子对象
|
||||
*/
|
||||
public class Sentence {
|
||||
String value;
|
||||
private double score;
|
||||
|
||||
private MapCount<String> mc = new MapCount<>();
|
||||
|
||||
public Sentence(String value) {
|
||||
this.value = value.trim();
|
||||
}
|
||||
|
||||
public void updateScore(String name, double score) {
|
||||
mc.add(name);
|
||||
Double size = mc.get().get(name);
|
||||
this.score += score / size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
||||
}

@@ -1,75 +0,0 @@
package org.ansj.app.summary;
|
||||
|
||||
import org.ansj.app.keyword.Keyword;
|
||||
import org.ansj.app.summary.pojo.Summary;
|
||||
import org.nlpcn.commons.lang.tire.SmartGetWord;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 关键字标红,
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class TagContent {
|
||||
|
||||
private String beginTag, endTag;
|
||||
|
||||
public TagContent(String beginTag, String endTag) {
|
||||
this.beginTag = beginTag;
|
||||
this.endTag = endTag;
|
||||
}
|
||||
|
||||
public String tagContent(Summary summary) {
|
||||
return tagContent(summary.getKeyWords(), summary.getSummary());
|
||||
}
|
||||
|
||||
public String tagContent(List<Keyword> keyWords, String content) {
|
||||
SmartForest<Double> sf = new SmartForest<>();
|
||||
for (Keyword keyWord : keyWords) {
|
||||
sf.add(keyWord.getName().toLowerCase(), keyWord.getScore());
|
||||
}
|
||||
|
||||
SmartGetWord<Double> sgw = new SmartGetWord<>(sf, content.toLowerCase());
|
||||
|
||||
int beginOffe = 0;
|
||||
String temp = null;
|
||||
StringBuilder sb = new StringBuilder();
|
||||
while ((temp = sgw.getFrontWords()) != null) {
|
||||
sb.append(content.substring(beginOffe, sgw.offe));
|
||||
sb.append(beginTag);
|
||||
sb.append(content.substring(sgw.offe, sgw.offe + temp.length()));
|
||||
sb.append(endTag);
|
||||
beginOffe = sgw.offe + temp.length();
|
||||
}
|
||||
|
||||
if (beginOffe <= content.length() - 1) {
|
||||
sb.append(content.substring(beginOffe, content.length()));
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}

@@ -1,58 +0,0 @@
package org.ansj.app.summary.pojo;

import org.ansj.app.keyword.Keyword;

import java.util.List;

/**
 * Wrapper for a summary result
 *
 * @author ansj
 *
 */
public class Summary {

    /**
     * Keywords
     */
    private List<Keyword> keyWords = null;

    /**
     * Summary text
     */
    private String summary;

    public Summary(List<Keyword> keyWords, String summary) {
        this.keyWords = keyWords;
        this.summary = summary;
    }

    public List<Keyword> getKeyWords() {
        return keyWords;
    }

    public String getSummary() {
        return summary;
    }

}

@@ -1,56 +0,0 @@
package org.ansj.dic;

import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

/**
 * Class used to load dictionaries
 *
 * @author ansj
 */
public class DicReader {

    private static final Log logger = LogFactory.getLog();

    public static BufferedReader getReader(String name) {
        // dictionary loading adapted for the Maven project layout
        InputStream in = DicReader.class.getResourceAsStream("/" + name);
        try {
            return new BufferedReader(new InputStreamReader(in, "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            logger.warn("Unsupported encoding", e);
        }
        return null;
    }

    public static InputStream getInputStream(String name) {
        // dictionary loading adapted for the Maven project layout
        InputStream in = DicReader.class.getResourceAsStream("/" + name);
        return in;
    }
}

@@ -1,207 +0,0 @@
package org.ansj.dic;
|
||||
|
||||
import org.ansj.app.crf.SplitWord;
|
||||
import org.ansj.domain.Nature;
|
||||
import org.ansj.domain.NewWord;
|
||||
import org.ansj.domain.TermNatures;
|
||||
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
|
||||
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
|
||||
import org.ansj.recognition.impl.NatureRecognition;
|
||||
import org.ansj.util.Graph;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
import org.nlpcn.commons.lang.util.CollectionUtil;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
/**
|
||||
* 新词发现,这是个线程安全的.所以可以多个对象公用一个
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class LearnTool {
|
||||
|
||||
private SplitWord splitWord = null;
|
||||
|
||||
/**
|
||||
* 是否开启学习机
|
||||
*/
|
||||
public boolean isAsianName = true;
|
||||
|
||||
public boolean isForeignName = true;
|
||||
|
||||
/**
|
||||
* 告诉大家你学习了多少个词了
|
||||
*/
|
||||
public int count;
|
||||
|
||||
/**
|
||||
* 新词发现的结果集.可以序列化到硬盘.然后可以当做训练集来做.
|
||||
*/
|
||||
private final SmartForest<NewWord> sf = new SmartForest<>();
|
||||
|
||||
/**
|
||||
* 学习新词排除用户自定义词典那中的词语
|
||||
*/
|
||||
private Forest[] forests;
|
||||
|
||||
/**
|
||||
* 公司名称学习.
|
||||
*
|
||||
* @param graph
|
||||
*/
|
||||
public void learn(Graph graph, SplitWord splitWord, Forest... forests) {
|
||||
|
||||
this.splitWord = splitWord;
|
||||
|
||||
this.forests = forests;
|
||||
|
||||
// 亚洲人名识别
|
||||
if (isAsianName) {
|
||||
findAsianPerson(graph);
|
||||
}
|
||||
|
||||
// 外国人名识别
|
||||
if (isForeignName) {
|
||||
findForeignPerson(graph);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void findAsianPerson(Graph graph) {
|
||||
List<NewWord> newWords = new AsianPersonRecognition().getNewWords(graph.terms);
|
||||
addListToTerm(newWords);
|
||||
}
|
||||
|
||||
private void findForeignPerson(Graph graph) {
|
||||
List<NewWord> newWords = new ForeignPersonRecognition().getNewWords(graph.terms);
|
||||
addListToTerm(newWords);
|
||||
}
|
||||
|
||||
// 批量将新词加入到词典中
|
||||
private void addListToTerm(List<NewWord> newWords) {
|
||||
if (newWords.isEmpty())
|
||||
return;
|
||||
for (NewWord newWord : newWords) {
|
||||
|
||||
TermNatures termNatures = new NatureRecognition(forests).getTermNatures(newWord.getName());
|
||||
|
||||
if (termNatures == TermNatures.NULL) {
|
||||
addTerm(newWord);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 增加一个新词到树中
|
||||
*
|
||||
* @param newWord
|
||||
*/
|
||||
public void addTerm(NewWord newWord) {
|
||||
NewWord temp = null;
|
||||
SmartForest<NewWord> smartForest = null;
|
||||
if ((smartForest = sf.getBranch(newWord.getName())) != null && smartForest.getParam() != null) {
|
||||
temp = smartForest.getParam();
|
||||
temp.update(newWord.getNature(), newWord.getAllFreq());
|
||||
} else {
|
||||
count++;
|
||||
if (splitWord == null) {
|
||||
newWord.setScore(-1);
|
||||
} else {
|
||||
newWord.setScore(-splitWord.cohesion(newWord.getName()));
|
||||
}
|
||||
|
||||
synchronized (sf) {
|
||||
sf.add(newWord.getName(), newWord);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public SmartForest<NewWord> getForest() {
|
||||
return this.sf;
|
||||
}
|
||||
|
||||
/**
|
||||
* 返回学习到的新词.
|
||||
*
|
||||
* @param num 返回数目.0为全部返回
|
||||
* @return
|
||||
*/
|
||||
public List<Entry<String, Double>> getTopTree(int num) {
|
||||
return getTopTree(num, null);
|
||||
}
|
||||
|
||||
public List<Entry<String, Double>> getTopTree(int num, Nature nature) {
|
||||
if (sf.branches == null) {
|
||||
return null;
|
||||
}
|
||||
HashMap<String, Double> hm = new HashMap<>();
|
||||
for (int i = 0; i < sf.branches.length; i++) {
|
||||
valueResult(sf.branches[i], hm, nature);
|
||||
}
|
||||
List<Entry<String, Double>> sortMapByValue = CollectionUtil.sortMapByValue(hm, -1);
|
||||
if (num == 0) {
|
||||
return sortMapByValue;
|
||||
} else {
|
||||
num = Math.min(num, sortMapByValue.size());
|
||||
return sortMapByValue.subList(0, num);
|
||||
}
|
||||
}
|
||||
|
||||
private void valueResult(SmartForest<NewWord> smartForest, HashMap<String, Double> hm, Nature nature) {
|
||||
|
||||
if (smartForest == null || smartForest.branches == null) {
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < smartForest.branches.length; i++) {
|
||||
NewWord param = smartForest.branches[i].getParam();
|
||||
if (smartForest.branches[i].getStatus() == 3) {
|
||||
if (param.isActive() && (nature == null || param.getNature().equals(nature))) {
|
||||
hm.put(param.getName(), param.getScore());
|
||||
}
|
||||
} else if (smartForest.branches[i].getStatus() == 2) {
|
||||
if (param.isActive() && (nature == null || param.getNature().equals(nature))) {
|
||||
hm.put(param.getName(), param.getScore());
|
||||
}
|
||||
valueResult(smartForest.branches[i], hm, nature);
|
||||
} else {
|
||||
valueResult(smartForest.branches[i], hm, nature);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 尝试激活,新词
|
||||
*
|
||||
* @param name
|
||||
*/
|
||||
public void active(String name) {
|
||||
SmartForest<NewWord> branch = sf.getBranch(name);
|
||||
if (branch != null && branch.getParam() != null) {
|
||||
branch.getParam().setActive(true);
|
||||
}
|
||||
}
|
||||
}

@@ -1,66 +0,0 @@
package org.ansj.dic;

import org.ansj.dic.impl.File2Stream;
import org.ansj.dic.impl.Jar2Stream;
import org.ansj.dic.impl.Jdbc2Stream;
import org.ansj.dic.impl.Url2Stream;
import org.ansj.exception.LibraryException;
import org.deeplearning4j.common.config.DL4JClassLoading;

import java.io.InputStream;

public abstract class PathToStream {

    public static InputStream stream(String path) {
        try {
            if (path.startsWith("file://")) {
                return new File2Stream().toStream(path);
            } else if (path.startsWith("jdbc://")) {
                return new Jdbc2Stream().toStream(path);
            } else if (path.startsWith("jar://")) {
                return new Jar2Stream().toStream(path);
            } else if (path.startsWith("class://")) {
                // Probably unused
                return loadClass(path);
            } else if (path.startsWith("http://") || path.startsWith("https://")) {
                return new Url2Stream().toStream(path);
            } else {
                return new File2Stream().toStream(path);
            }
        } catch (Exception e) {
            throw new LibraryException(e);
        }
    }

    public abstract InputStream toStream(String path);

    static InputStream loadClass(String path) {
        String className = path
                .substring("class://".length())
                .split("\\|")[0];

        return DL4JClassLoading
                .createNewInstance(className, PathToStream.class)
                .toStream(path);
    }
}

@@ -1,103 +0,0 @@
package org.ansj.dic.impl;
|
||||
|
||||
import org.ansj.dic.PathToStream;
|
||||
import org.ansj.exception.LibraryException;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Vector;
|
||||
|
||||
public class File2Stream extends PathToStream {
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(File2Stream.class);
|
||||
|
||||
@Override
|
||||
public InputStream toStream(String path) {
|
||||
LOG.info("path to stream " + path);
|
||||
|
||||
if (path.startsWith("file://")) {
|
||||
path = path.substring(7);
|
||||
}
|
||||
|
||||
File file = new File(path);
|
||||
|
||||
if (file.exists() && file.canRead()) {
|
||||
|
||||
try {
|
||||
if (file.isDirectory()) {
|
||||
return multiple(path);
|
||||
} else {
|
||||
return new FileInputStream(file);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new LibraryException(e);
|
||||
}
|
||||
}
|
||||
throw new LibraryException(
|
||||
" path :" + path + " file:" + file.getAbsolutePath() + " not found or can not to read");
|
||||
|
||||
}
|
||||
|
||||
private InputStream multiple(String path) throws FileNotFoundException {
|
||||
File[] libs = new File[0];
|
||||
|
||||
File file = new File(path);
|
||||
|
||||
if (file.exists() && file.canRead()) {
|
||||
if (file.isFile()) {
|
||||
libs = new File[1];
|
||||
libs[0] = file;
|
||||
} else if (file.isDirectory()) {
|
||||
|
||||
File[] files = file.listFiles(new FileFilter() {
|
||||
@Override
|
||||
public boolean accept(File file) {
|
||||
return file.canRead() && !file.isHidden() && !file.isDirectory();
|
||||
}
|
||||
});
|
||||
|
||||
if (files != null && files.length > 0) {
|
||||
libs = files;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (libs.length == 0) {
|
||||
throw new LibraryException("not find any file in path : " + path);
|
||||
}
|
||||
|
||||
if (libs.length == 1) {
|
||||
return new FileInputStream(libs[0]);
|
||||
}
|
||||
|
||||
Vector<InputStream> vector = new Vector<>(libs.length);
|
||||
|
||||
for (int i = 0; i < libs.length; i++) {
|
||||
vector.add(new FileInputStream(libs[i]));
|
||||
}
|
||||
|
||||
return new SequenceInputStream(vector.elements());
|
||||
}
|
||||
|
||||
}

@@ -1,50 +0,0 @@
package org.ansj.dic.impl;

import org.ansj.dic.DicReader;
import org.ansj.dic.PathToStream;
import org.ansj.exception.LibraryException;
import org.deeplearning4j.common.config.DL4JClassLoading;

import java.io.InputStream;

public class Jar2Stream extends PathToStream {

    @Override
    public InputStream toStream(String path) {
        if (path.contains("|")) {
            String[] tokens = path.split("\\|");
            String className = tokens[0].substring(6);
            String resourceName = tokens[1].trim();

            Class<Object> resourceClass = DL4JClassLoading.loadClassByName(className);
            if (resourceClass == null) {
                throw new LibraryException(String.format("Class '%s' was not found.", className));
            }

            return resourceClass.getResourceAsStream(resourceName);
        } else {
            return DicReader.getInputStream(path.substring(6));
        }
    }

}

@@ -1,115 +0,0 @@
package org.ansj.dic.impl;
|
||||
|
||||
import org.ansj.dic.PathToStream;
|
||||
import org.ansj.exception.LibraryException;
|
||||
import org.deeplearning4j.common.config.DL4JClassLoading;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.ResultSet;
|
||||
|
||||
public class Jdbc2Stream extends PathToStream {
|
||||
|
||||
private static final byte[] TAB = "\t".getBytes();
|
||||
|
||||
private static final byte[] LINE = "\n".getBytes();
|
||||
|
||||
private static final String[] JDBC_DRIVERS = {
|
||||
"org.h2.Driver",
|
||||
"com.ibm.db2.jcc.DB2Driver",
|
||||
"org.hsqldb.jdbcDriver",
|
||||
"org.gjt.mm.mysql.Driver",
|
||||
"oracle.jdbc.OracleDriver",
|
||||
"org.postgresql.Driver",
|
||||
"net.sourceforge.jtds.jdbc.Driver",
|
||||
"com.microsoft.sqlserver.jdbc.SQLServerDriver",
|
||||
"org.sqlite.JDBC",
|
||||
"com.mysql.jdbc.Driver"
|
||||
};
|
||||
|
||||
static {
|
||||
loadJdbcDrivers();
|
||||
}
|
||||
|
||||
static void loadJdbcDrivers() {
|
||||
for (String driverClassName : JDBC_DRIVERS) {
|
||||
DL4JClassLoading.loadClassByName(driverClassName);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream toStream(String path) {
|
||||
path = path.substring(7);
|
||||
|
||||
String[] split = path.split("\\|");
|
||||
|
||||
String jdbc = split[0];
|
||||
|
||||
String username = split[1];
|
||||
|
||||
String password = split[2];
|
||||
|
||||
String sqlStr = split[3];
|
||||
|
||||
String logStr = jdbc + "|" + username + "|********|" + sqlStr;
|
||||
|
||||
try (Connection conn = DriverManager.getConnection(jdbc, username, password);
|
||||
PreparedStatement statement = conn.prepareStatement(sqlStr);
|
||||
ResultSet rs = statement.executeQuery();
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(100 * 1024)) {
|
||||
|
||||
int i, count;
|
||||
while (rs.next()) {
|
||||
for (i = 1, count = rs.getMetaData().getColumnCount(); i < count; ++i) {
|
||||
baos.write(String.valueOf(rs.getObject(i)).getBytes());
|
||||
baos.write(TAB);
|
||||
}
|
||||
baos.write(String.valueOf(rs.getObject(count)).getBytes());
|
||||
baos.write(LINE);
|
||||
}
|
||||
|
||||
return new ByteArrayInputStream(baos.toByteArray());
|
||||
} catch (Exception e) {
|
||||
throw new LibraryException("err to load by jdbc " + logStr);
|
||||
}
|
||||
}
|
||||
|
||||
public static String encryption(String path) {
|
||||
|
||||
String[] split = path.split("\\|");
|
||||
|
||||
String jdbc = split[0];
|
||||
|
||||
String username = split[1];
|
||||
|
||||
String password = split[2];
|
||||
|
||||
String sqlStr = split[3];
|
||||
|
||||
return jdbc + "|" + username + "|********|" + sqlStr;
|
||||
}
|
||||
}

@@ -1,42 +0,0 @@
package org.ansj.dic.impl;

import org.ansj.dic.PathToStream;
import org.ansj.exception.LibraryException;

import java.io.InputStream;
import java.net.URL;

public class Url2Stream extends PathToStream {

    @Override
    public InputStream toStream(String path) {
        try {
            URL url = new URL(path);
            return url.openStream();
        } catch (Exception e) {
            throw new LibraryException("err to load by http " + path + " message : " + e.getMessage());
        }
    }

}

@@ -1,82 +0,0 @@
package org.ansj.domain;
|
||||
|
||||
import org.nlpcn.commons.lang.dat.Item;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
public class AnsjItem extends Item implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
public static final AnsjItem NULL = new AnsjItem();
|
||||
|
||||
public static final AnsjItem BEGIN = new AnsjItem();
|
||||
|
||||
public static final AnsjItem END = new AnsjItem();
|
||||
|
||||
static {
|
||||
NULL.base = 0;
|
||||
|
||||
BEGIN.index = 0;
|
||||
BEGIN.termNatures = TermNatures.BEGIN;
|
||||
|
||||
END.index = -1;
|
||||
END.termNatures = TermNatures.END;
|
||||
}
|
||||
|
||||
public String param;
|
||||
|
||||
/**
|
||||
* frequency : 词性词典,以及词性的相关权重
|
||||
*/
|
||||
public TermNatures termNatures = null;
|
||||
|
||||
public Map<Integer, Integer> bigramEntryMap = null;
|
||||
|
||||
@Override
|
||||
public void init(String[] split) {
|
||||
this.name = split[0];
|
||||
this.param = split[1];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initValue(String[] split) {
|
||||
index = Integer.parseInt(split[0]);
|
||||
base = Integer.parseInt(split[2]);
|
||||
check = Integer.parseInt(split[3]);
|
||||
status = Byte.parseByte(split[4]);
|
||||
if (status > 1) {
|
||||
name = split[1];
|
||||
termNatures = new TermNatures(TermNature.setNatureStrToArray(split[5]), index);
|
||||
} else {
|
||||
termNatures = new TermNatures(TermNature.NULL);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toText() {
|
||||
return index + "\t" + name + "\t" + base + "\t" + check + "\t" + status + "\t" + param;
|
||||
}
|
||||
|
||||
}

@@ -1,53 +0,0 @@
package org.ansj.domain;

public class KV<K, V> {

    private K k;

    private V v;

    private KV(K k, V v) {
        this.k = k;
        this.v = v;
    }

    public static <K, V> KV<K, V> with(K k, V v) {
        return new KV<>(k, v);
    }

    public void setK(K k) {
        this.k = k;
    }

    public void setV(V v) {
        this.v = v;
    }

    public K getK() {
        return k;
    }

    public V getV() {
        return v;
    }
}

@@ -1,73 +0,0 @@
package org.ansj.domain;

import org.ansj.library.NatureLibrary;

import java.io.Serializable;

/**
 * Encapsulates some of the basic part-of-speech tags.
 *
 * @author ansj
 *
 */
public class Nature implements Serializable {

    private static final long serialVersionUID = -1427092012930357598L;
    // name of the part of speech
    public final String natureStr;
    // position in the part-of-speech lookup table
    public final int index;
    // index value of the part of speech
    public final int natureIndex;
    // frequency of the part of speech
    public final int allFrequency;

    public static final Nature NW = NatureLibrary.getNature("nw");

    public static final Nature NRF = NatureLibrary.getNature("nrf");

    public static final Nature NR = NatureLibrary.getNature("nr");

    public static final Nature NULL = NatureLibrary.getNature("null");

    public Nature(String natureStr, int index, int natureIndex, int allFrequency) {
        this.natureStr = natureStr;
        this.index = index;
        this.natureIndex = natureIndex;
        this.allFrequency = allFrequency;
    }

    public Nature(String natureStr) {
        this.natureStr = natureStr;
        this.index = 0;
        this.natureIndex = 0;
        this.allFrequency = 0;
    }

    @Override
    public String toString() {
        return natureStr + ":" + index + ":" + natureIndex;
    }
}

@@ -1,116 +0,0 @@
package org.ansj.domain;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* 新词发现,实体名
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class NewWord implements Serializable {
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 7226797287286838356L;
|
||||
// 名字
|
||||
private String name;
|
||||
// 分数
|
||||
private double score;
|
||||
// 词性
|
||||
private Nature nature;
|
||||
// 总词频
|
||||
private int allFreq;
|
||||
// 此词是否被激活
|
||||
private boolean isActive;
|
||||
|
||||
public NewWord(String name, Nature nature, double score) {
|
||||
this.name = name;
|
||||
this.nature = nature;
|
||||
this.score = score;
|
||||
this.allFreq = 1;
|
||||
}
|
||||
|
||||
public NewWord(String name, Nature nature) {
|
||||
this.name = name;
|
||||
this.nature = nature;
|
||||
this.allFreq = 1;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public double getScore() {
|
||||
return score;
|
||||
}
|
||||
|
||||
public Nature getNature() {
|
||||
return nature;
|
||||
}
|
||||
|
||||
public void setNature(Nature nature) {
|
||||
this.nature = nature;
|
||||
}
|
||||
|
||||
/**
|
||||
* 更新发现权重,并且更新词性
|
||||
*
|
||||
* @param version
|
||||
* @param i
|
||||
* @param tn
|
||||
*/
|
||||
public void update(Nature nature, int freq) {
|
||||
this.score += score * freq;
|
||||
this.allFreq += freq;
|
||||
if (Nature.NW != nature) {
|
||||
this.nature = nature;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return this.name + "\t" + this.score + "\t" + this.getNature().natureStr;
|
||||
}
|
||||
|
||||
public int getAllFreq() {
|
||||
return allFreq;
|
||||
}
|
||||
|
||||
public void setScore(double score) {
|
||||
this.score = score;
|
||||
}
|
||||
|
||||
public boolean isActive() {
|
||||
return isActive;
|
||||
}
|
||||
|
||||
public void setActive(boolean isActive) {
|
||||
this.isActive = isActive;
|
||||
}
|
||||
|
||||
}

@@ -1,44 +0,0 @@
package org.ansj.domain;

import java.io.Serializable;

public class NumNatureAttr implements Serializable {

    private static final long serialVersionUID = 1L;

    public static final NumNatureAttr NULL = new NumNatureAttr();

    // may be a number
    public int numFreq = -1;

    // end of a number
    public int numEndFreq = -1;

    // whether the most likely part of speech is numeric
    public boolean flag = false;

    public NumNatureAttr() {}
}

@@ -1,129 +0,0 @@
package org.ansj.domain;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* 人名标注pojo类
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class PersonNatureAttr implements Serializable {
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -8443825231800208197L;
|
||||
|
||||
// public int B = -1;//0 姓氏
|
||||
// public int C = -1;//1 双名的首字
|
||||
// public int D = -1;//2 双名的末字
|
||||
// public int E = -1;//3 单名
|
||||
// public int N = -1; //4任意字
|
||||
// public int L = -1;//11 人名的下文
|
||||
// public int M = -1;//12 两个中国人名之间的成分
|
||||
// public int m = -1;//44 可拆分的姓名
|
||||
// String[] parretn = {"BC", "BCD", "BCDE", "BCDEN"}
|
||||
// double[] factory = {"BC", "BCD", "BCDE", "BCDEN"}
|
||||
|
||||
public static final PersonNatureAttr NULL = new PersonNatureAttr();
|
||||
|
||||
private int[][] locFreq = null;
|
||||
|
||||
public int split;
|
||||
// 12
|
||||
public int begin;
|
||||
// 11+12
|
||||
public int end;
|
||||
|
||||
public int allFreq;
|
||||
|
||||
// 是否有可能是名字的第一个字
|
||||
public boolean flag;
|
||||
|
||||
/**
|
||||
* 设置
|
||||
*
|
||||
* @param index
|
||||
* @param freq
|
||||
*/
|
||||
public void addFreq(int index, int freq) {
|
||||
switch (index) {
|
||||
case 11:
|
||||
this.end += freq;
|
||||
allFreq += freq;
|
||||
break;
|
||||
case 12:
|
||||
this.end += freq;
|
||||
this.begin += freq;
|
||||
allFreq += freq;
|
||||
break;
|
||||
case 44:
|
||||
this.split += freq;
|
||||
allFreq += freq;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 得道某一个位置的词频
|
||||
*
|
||||
* @param length
|
||||
* @param loc
|
||||
* @return
|
||||
*/
|
||||
public int getFreq(int length, int loc) {
|
||||
if (locFreq == null)
|
||||
return 0;
|
||||
if (length > 3)
|
||||
length = 3;
|
||||
if (loc > 4)
|
||||
loc = 4;
|
||||
return locFreq[length][loc];
|
||||
}
|
||||
|
||||
/**
|
||||
* 词频记录表
|
||||
*
|
||||
* @param ints
|
||||
*/
|
||||
public void setlocFreq(int[][] ints) {
|
||||
for (int i = 0; i < ints.length; i++) {
|
||||
if (ints[i][0] > 0) {
|
||||
flag = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
locFreq = ints;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("begin=" + begin);
|
||||
sb.append(",");
|
||||
sb.append("end=" + end);
|
||||
sb.append(",");
|
||||
sb.append("split=" + split);
|
||||
return sb.toString();
|
||||
}
|
||||
}

@@ -1,114 +0,0 @@
package org.ansj.domain;

import org.ansj.recognition.Recognition;
import org.nlpcn.commons.lang.util.StringUtil;

import java.util.Iterator;
import java.util.List;

/**
 * A wrapper around a segmentation result.
 *
 * @author Ansj
 *
 */
public class Result implements Iterable<Term> {

    private List<Term> terms = null;

    public Result(List<Term> terms) {
        this.terms = terms;
    }

    public List<Term> getTerms() {
        return terms;
    }

    public void setTerms(List<Term> terms) {
        this.terms = terms;
    }

    @Override
    public Iterator<Term> iterator() {
        return terms.iterator();
    }

    public int size() {
        return terms.size();
    }

    public Term get(int index) {
        return terms.get(index);
    }

    /**
     * Apply a recognition engine to this result.
     *
     * @return
     */
    public Result recognition(Recognition re) {
        re.recognition(this);
        return this;
    }

    @Override
    public String toString() {
        return toString(",");
    }

    public String toString(String split) {
        return StringUtil.joiner(this.terms, split);
    }

    /**
     * Return the segmentation result without part-of-speech tags.
     * @return
     */
    public String toStringWithOutNature() {
        return toStringWithOutNature(",");
    }

    /**
     * Return the segmentation result without part-of-speech tags.
     * @return
     */
    public String toStringWithOutNature(String split) {

        if (terms == null || terms.isEmpty()) {
            return "";
        }

        Iterator<Term> iterator = terms.iterator();

        StringBuilder sb = new StringBuilder(iterator.next().getRealName());

        while (iterator.hasNext()) {
            sb.append(split);
            sb.append(iterator.next().getRealName());
        }

        return sb.toString();
    }

}
@@ -1,320 +0,0 @@
package org.ansj.domain;

import org.ansj.util.MathUtil;
import org.nlpcn.commons.lang.util.StringUtil;

import java.io.Serializable;
import java.util.List;
import java.util.Map;

public class Term implements Serializable {
    /**
     *
     */
    private static final long serialVersionUID = 1L;
    // the current word
    private String name;
    //
    private String realName;
    // start offset of the current word
    private int offe;
    // list of natures (parts of speech)
    private TermNatures termNatures = TermNatures.NULL;
    // dictionary item
    private AnsjItem item = AnsjItem.NULL;
    // next term in the same row
    private Term next;
    // score
    private double score = 0;
    // the term's own score
    private double selfScore = 1;
    // start position (preceding term)
    private Term from;
    // end position (following term)
    private Term to;
    // the nature of this term itself; only has a value after POS recognition, empty by default
    private Nature nature = Nature.NULL;
    // whether this is a new word
    private boolean newWord;
    // synonyms
    private List<String> synonyms;

    private List<Term> subTerm = null;

    public Term(String name, int offe, AnsjItem item) {
        super();
        this.name = name;
        this.offe = offe;
        this.item = item;
        if (item.termNatures != null) {
            this.termNatures = item.termNatures;
            if (termNatures.nature != null) {
                this.nature = termNatures.nature;
            }
        }
    }

    public Term(String name, int offe, TermNatures termNatures) {
        super();
        this.name = name;
        this.offe = offe;
        this.termNatures = termNatures;
        if (termNatures.nature != null) {
            this.nature = termNatures.nature;
        }
    }

    public Term(String name, int offe, String natureStr, int natureFreq) {
        super();
        this.name = name;
        this.offe = offe;
        TermNature termNature = new TermNature(natureStr, natureFreq);
        this.nature = termNature.nature;
        this.termNatures = new TermNatures(termNature);
    }

    // the position this term can reach
    public int toValue() {
        return offe + name.length();
    }

    public int getOffe() {
        return offe;
    }

    public void setOffe(int offe) {
        this.offe = offe;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * Core routine for building the optimal path.
     *
     * @param term
     */
    public void setPathScore(Term from, Map<String, Double> relationMap) {
        // Viterbi-style construction of the optimal path
        double score = MathUtil.compuScore(from, this, relationMap);
        if (this.from == null || this.score == 0 || this.score >= score) {
            this.setFromAndScore(from, score);
        }
    }

    /**
     * Optimal path by the term's own score; smaller is better.
     *
     * @param term
     */
    public void setPathSelfScore(Term from) {
        double score = this.selfScore + from.score;
        // Viterbi-style construction of the optimal path
        if (this.from == null || this.score > score) {
            this.setFromAndScore(from, score);
        }
    }

    private void setFromAndScore(Term from, double score) {
        this.from = from;
        this.score = score;
    }

    /**
     * Merge this term with another term.
     *
     * @param term
     * @param maxNature
     */
    public Term merage(Term to) {
        this.name = this.name + to.getName();
        if (StringUtil.isNotBlank(this.realName) && StringUtil.isNotBlank(to.getRealName())) {
            this.realName = this.realName + to.getRealName();
        }
        this.setTo(to.to);
        return this;
    }

    /**
     * Merge terms; this variant also merges whitespace characters.
     *
     * @param term
     * @param maxNature
     */
    public Term merageWithBlank(Term to) {
        this.name = this.name + to.getName();
        this.realName = this.realName + to.getRealName();
        this.setTo(to.to);
        return this;
    }

    /**
     * Update the offset.
     *
     * @param offe
     */
    public void updateOffe(int offe) {
        this.offe += offe;
    }

    public Term next() {
        return next;
    }

    /**
     * Returns this term.
     *
     * @param next
     *            set its next term
     * @return
     */
    public Term setNext(Term next) {
        this.next = next;
        return this;
    }

    public Term from() {
        return from;
    }

    public Term to() {
        return to;
    }

    public void setFrom(Term from) {
        this.from = from;
    }

    public void setTo(Term to) {
        this.to = to;
    }

    /**
     * Get all natures (parts of speech) of this term.
     *
     * @return
     */
    public TermNatures termNatures() {
        return termNatures;
    }

    public void setNature(Nature nature) {
        this.nature = nature;
    }

    /**
     * Get the nature of this word; only valid after POS computation.
     *
     * @return
     */
    public Nature natrue() {
        return nature;
    }

    public String getNatureStr() {
        return nature.natureStr;
    }

    @Override
    public String toString() {
        if ("null".equals(nature.natureStr)) {
            return this.getRealName();
        }
        return this.getRealName() + "/" + nature.natureStr;
    }

    /**
     * Reset all scores of this term to 0.
     */
    public void clearScore() {
        this.score = 0;
        this.selfScore = 0;
    }

    public void setSubTerm(List<Term> subTerm) {
        this.subTerm = subTerm;
    }

    public List<Term> getSubTerm() {
        return subTerm;
    }

    public String getRealName() {
        if (realName == null) {
            return name;
        }
        return realName;
    }

    public void setRealName(String realName) {
        this.realName = realName;
    }

    public double score() {
        return this.score;
    }

    public void score(double score) {
        this.score = score;
    }

    public double selfScore() {
        return this.selfScore;
    }

    public void selfScore(double selfScore) {
        this.selfScore = selfScore;
    }

    public AnsjItem item() {
        return this.item;
    }

    public boolean isNewWord() {
        return newWord;
    }

    public void setNewWord(boolean newWord) {
        this.newWord = newWord;
    }

    public void updateTermNaturesAndNature(TermNatures termNatures) {
        this.termNatures = termNatures;
        this.nature = termNatures.nature;
    }

    public List<String> getSynonyms() {
        return synonyms;
    }

    public void setSynonyms(List<String> synonyms) {
        this.synonyms = synonyms;
    }

}
@@ -1,80 +0,0 @@
package org.ansj.domain;

import org.ansj.library.NatureLibrary;

import java.io.Serializable;

/**
 * A word can carry several natures (parts of speech).
 *
 * @author ansj
 */
public class TermNature implements Serializable {
    /**
     *
     */
    private static final long serialVersionUID = 5538058744208591381L;
    /**
     * Built-in natures.
     */
    public static final TermNature M = new TermNature("m", 1);
    public static final TermNature EN = new TermNature("en", 1);
    public static final TermNature BEGIN = new TermNature("始##始", 1);
    public static final TermNature END = new TermNature("末##末", 1);
    public static final TermNature USER_DEFINE = new TermNature("userDefine", 1);
    public static final TermNature NR = new TermNature("nr", 1);
    public static final TermNature NT = new TermNature("nt", 1);
    public static final TermNature NS = new TermNature("ns", 1);
    public static final TermNature NW = new TermNature("nw", 1);
    public static final TermNature NRF = new TermNature("nrf", 1);
    public static final TermNature NULL = new TermNature("null", 1);

    public Nature nature;

    public int frequency;

    public TermNature(String natureStr, int frequency) {
        this.nature = NatureLibrary.getNature(natureStr);
        this.frequency = frequency;
    }

    public static TermNature[] setNatureStrToArray(String natureStr) {

        natureStr = natureStr.substring(1, natureStr.length() - 1);
        String[] split = natureStr.split(",");
        String[] strs = null;
        Integer frequency = null;
        TermNature[] all = new TermNature[split.length];
        for (int i = 0; i < split.length; i++) {
            strs = split[i].split("=");
            frequency = Integer.parseInt(strs[1]);
            all[i] = new TermNature(strs[0].trim(), frequency);
        }
        return all;
    }

    @Override
    public String toString() {
        return nature.natureStr + "/" + frequency;
    }
}
@@ -1,160 +0,0 @@
package org.ansj.domain;

import java.io.Serializable;

/**
 * Every term owns a set of natures (parts of speech).
 *
 * @author ansj
 *
 */
public class TermNatures implements Serializable {

    private static final long serialVersionUID = 1L;

    public static final TermNatures M = new TermNatures(TermNature.M);

    public static final TermNatures NR = new TermNatures(TermNature.NR);

    public static final TermNatures EN = new TermNatures(TermNature.EN);

    public static final TermNatures END = new TermNatures(TermNature.END, 50610, -1);

    public static final TermNatures BEGIN = new TermNatures(TermNature.BEGIN, 50610, 0);

    public static final TermNatures NT = new TermNatures(TermNature.NT);

    public static final TermNatures NS = new TermNatures(TermNature.NS);

    public static final TermNatures NRF = new TermNatures(TermNature.NRF);

    public static final TermNatures NW = new TermNatures(TermNature.NW);

    public static final TermNatures NULL = new TermNatures(TermNature.NULL);

    /**
     * All natures of this term.
     */
    public TermNature[] termNatures = null;

    /**
     * Numeric attribute.
     */
    public NumNatureAttr numAttr = NumNatureAttr.NULL;

    /**
     * Person-name attribute.
     */
    public PersonNatureAttr personAttr = PersonNatureAttr.NULL;

    /**
     * Default nature.
     */
    public Nature nature = null;

    /**
     * Total word frequency.
     */
    public int allFreq = 0;

    /**
     * Word id.
     */
    public int id = -2;

    /**
     * Constructor; one word maps to one of these.
     *
     * @param termNatures
     */
    public TermNatures(TermNature[] termNatures, int id) {
        this.id = id;
        this.termNatures = termNatures;
        // find maxNature
        int maxFreq = -1;
        TermNature termNature = null;
        for (int i = 0; i < termNatures.length; i++) {
            if (maxFreq < termNatures[i].frequency) {
                maxFreq = termNatures[i].frequency;
                termNature = termNatures[i];
            }
        }

        if (termNature != null) {
            this.nature = termNature.nature;
        }

        serAttribute();
    }

    public TermNatures(TermNature termNature) {
        termNatures = new TermNature[1];
        this.termNatures[0] = termNature;
        this.nature = termNature.nature;
        serAttribute();
    }

    public TermNatures(TermNature termNature, int allFreq, int id) {
        this.id = id;
        termNatures = new TermNature[1];
        termNature.frequency = allFreq;
        this.termNatures[0] = termNature;
        this.allFreq = allFreq;
    }

    private void serAttribute() {
        TermNature termNature = null;
        int max = 0;
        NumNatureAttr numNatureAttr = null;
        for (int i = 0; i < termNatures.length; i++) {
            termNature = termNatures[i];
            allFreq += termNature.frequency;
            max = Math.max(max, termNature.frequency);
            switch (termNature.nature.index) {
                case 18:
                    if (numNatureAttr == null) {
                        numNatureAttr = new NumNatureAttr();
                    }
                    numNatureAttr.numFreq = termNature.frequency;
                    break;
                case 29:
                    if (numNatureAttr == null) {
                        numNatureAttr = new NumNatureAttr();
                    }
                    numNatureAttr.numEndFreq = termNature.frequency;
                    break;
            }
        }
        if (numNatureAttr != null) {
            if (max == numNatureAttr.numFreq) {
                numNatureAttr.flag = true;
            }
            this.numAttr = numNatureAttr;
        }
    }

    public void setPersonNatureAttr(PersonNatureAttr personAttr) {
        this.personAttr = personAttr;
    }

}
@@ -1,35 +0,0 @@
package org.ansj.exception;

public class LibraryException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    public LibraryException(Exception e) {
        super(e);
    }

    public LibraryException(String message) {
        super(message);
    }

}
@@ -1,233 +0,0 @@
package org.ansj.library;

import org.ansj.dic.PathToStream;
import org.ansj.domain.KV;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.Value;
import org.nlpcn.commons.lang.tire.library.Library;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;

import java.io.BufferedReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

public class AmbiguityLibrary {

    private static final Log LOG = MyStaticValue.getLog(AmbiguityLibrary.class);

    // ambiguity dictionaries
    private static final Map<String, KV<String, Forest>> AMBIGUITY = new HashMap<>();

    public static final String DEFAULT = "ambiguity";

    static {
        for (Entry<String, String> entry : MyStaticValue.ENV.entrySet()) {
            if (entry.getKey().startsWith(DEFAULT)) {
                put(entry.getKey(), entry.getValue());
            }
        }
        putIfAbsent(DEFAULT, "library/ambiguity.dic");
    }

    /**
     * Get the default system dictionary.
     *
     * @return
     */
    public static Forest get() {
        if (!AMBIGUITY.containsKey(DEFAULT)) {
            return null;
        }
        return get(DEFAULT);
    }

    /**
     * Get by key.
     *
     */
    public static Forest get(String key) {

        KV<String, Forest> kv = AMBIGUITY.get(key);

        if (kv == null) {
            if (MyStaticValue.ENV.containsKey(key)) {
                putIfAbsent(key, MyStaticValue.ENV.get(key));
                return get(key);
            }

            LOG.warn("crf " + key + " not found in config ");
            return null;
        }

        Forest sw = kv.getV();
        if (sw == null) {
            try {
                sw = init(key, kv, false);
            } catch (Exception e) {
            }
        }
        return sw;
    }

    /**
     * Load the dictionary.
     *
     * @return
     */
    private static synchronized Forest init(String key, KV<String, Forest> kv, boolean reload) {
        Forest forest = kv.getV();
        if (forest != null) {
            if (reload) {
                forest.clear();
            } else {
                return forest;
            }
        } else {
            forest = new Forest();
        }
        try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "utf-8")) {
            String temp;
            LOG.debug("begin init ambiguity");
            long start = System.currentTimeMillis();
            while ((temp = br.readLine()) != null) {
                if (StringUtil.isNotBlank(temp)) {
                    temp = StringUtil.trim(temp);
                    String[] split = temp.split("\t");
                    StringBuilder sb = new StringBuilder();
                    if (split.length % 2 != 0) {
                        LOG.error("init ambiguity error in line :" + temp + " format err !");
                        continue;
                    }
                    for (int i = 0; i < split.length; i += 2) {
                        sb.append(split[i]);
                    }
                    forest.addBranch(sb.toString(), split);
                }
            }
            LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
            kv.setV(forest);
            return forest;
        } catch (Exception e) {
            LOG.error("Init ambiguity library error :" + e.getMessage() + ", path: " + kv.getK());
            AMBIGUITY.remove(key);
            return null;
        }
    }

    /**
     * Insert an entry into the trie.
     *
     * @param key
     * @param split
     * @return
     */
    public static void insert(String key, String... split) {
        Forest forest = get(key);
        StringBuilder sb = new StringBuilder();
        if (split.length % 2 != 0) {
            LOG.error("init ambiguity error in line :" + Arrays.toString(split) + " format err !");
            return;
        }
        for (int i = 0; i < split.length; i += 2) {
            sb.append(split[i]);
        }
        forest.addBranch(sb.toString(), split);
    }

    /**
     * Insert a value into the trie.
     *
     * @param key
     * @param value
     */
    public static void insert(String key, Value value) {
        Forest forest = get(key);
        Library.insertWord(forest, value);
    }

    /**
     * Add a dictionary dynamically.
     *
     * @param dicDefault
     * @param dicDefault2
     * @param dic2
     */
    public static void put(String key, String path) {
        put(key, path, null);
    }

    public static void put(String key, String path, Forest value) {
        AMBIGUITY.put(key, KV.with(path, value));
        MyStaticValue.ENV.put(key, path);
    }

    /**
     * Remove a key.
     *
     * @param key
     * @return
     */
    public static KV<String, Forest> remove(String key) {
        KV<String, Forest> kv = AMBIGUITY.get(key);
        if (kv != null && kv.getV() != null) {
            kv.getV().clear();
        }
        MyStaticValue.ENV.remove(key);
        return AMBIGUITY.remove(key);
    }

    /**
     * Reload a key by resetting its value to null.
     *
     * @param key
     * @return
     */
    public static void reload(String key) {
        if (!MyStaticValue.ENV.containsKey(key)) { // if the key is not in the environment, just remove it
            remove(key);
        }

        putIfAbsent(key, MyStaticValue.ENV.get(key));

        KV<String, Forest> kv = AMBIGUITY.get(key);

        init(key, kv, true);
    }

    public static Set<String> keys() {
        return AMBIGUITY.keySet();
    }

    public static void putIfAbsent(String key, String path) {
        if (!AMBIGUITY.containsKey(key)) {
            AMBIGUITY.put(key, KV.with(path, (Forest) null));
        }
    }

}
@@ -1,163 +0,0 @@
package org.ansj.library;

import org.ansj.app.crf.Model;
import org.ansj.app.crf.SplitWord;
import org.ansj.app.crf.model.CRFModel;
import org.ansj.dic.PathToStream;
import org.ansj.domain.KV;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.logging.Log;

import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

public class CrfLibrary {

    private static final Log LOG = MyStaticValue.getLog(CrfLibrary.class);

    // CRF models
    private static final Map<String, KV<String, SplitWord>> CRF = new HashMap<>();

    public static final String DEFAULT = "crf";

    static {
        for (Entry<String, String> entry : MyStaticValue.ENV.entrySet()) {
            if (entry.getKey().startsWith(DEFAULT)) {
                put(entry.getKey(), entry.getValue());
            }
        }
        putIfAbsent(DEFAULT, "jar://crf.model");
    }

    public static SplitWord get() {
        return get(DEFAULT);
    }

    /**
     * Get the CRF segmenter for the given key.
     *
     * @param key
     * @return the CRF segmenter
     */
    public static SplitWord get(String key) {

        KV<String, SplitWord> kv = CRF.get(key);

        if (kv == null) {
            if (MyStaticValue.ENV.containsKey(key)) {
                putIfAbsent(key, MyStaticValue.ENV.get(key));
                return get(key);
            }
            LOG.warn("crf " + key + " not found in config ");
            return null;
        }

        SplitWord sw = kv.getV();
        if (sw == null) {
            sw = initCRFModel(kv);
        }
        return sw;
    }

    /**
     * Load a CRF model.
     *
     * @param modelPath
     * @return
     */
    private static synchronized SplitWord initCRFModel(KV<String, SplitWord> kv) {
        try {
            if (kv.getV() != null) {
                return kv.getV();
            }

            long start = System.currentTimeMillis();
            LOG.debug("begin init crf model!");
            try (InputStream is = PathToStream.stream(kv.getK())) {
                SplitWord crfSplitWord = new SplitWord(Model.load(CRFModel.class, is));
                kv.setV(crfSplitWord);
                LOG.info("load crf use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
                return crfSplitWord;
            }
        } catch (Exception e) {
            LOG.error(kv + " load err " + e.getMessage());
            return null;
        }
    }

    /**
     * Add a model dynamically.
     *
     * @param dicDefault
     * @param dicDefault2
     * @param dic2
     */
    public static void put(String key, String path) {

        put(key, path, null);
    }

    public static void put(String key, String path, SplitWord sw) {
        CRF.put(key, KV.with(path, sw));
        MyStaticValue.ENV.put(key, path);
    }

    /**
     * Remove a key.
     *
     * @param key
     * @return
     */
    public static KV<String, SplitWord> remove(String key) {
        MyStaticValue.ENV.remove(key);
        return CRF.remove(key);
    }

    /**
     * Reload a key by resetting its value to null.
     *
     * @param key
     * @return
     */
    public static void reload(String key) {
        KV<String, SplitWord> kv = CRF.get(key);
        if (kv != null) {
            CRF.get(key).setV(null);
        }

        LOG.warn("make sure ,this reload not use same obj , it to instance a new model");
    }

    public static Set<String> keys() {
        return CRF.keySet();
    }

    public static void putIfAbsent(String key, String path) {
        if (!CRF.containsKey(key)) {
            CRF.put(key, KV.with(path, (SplitWord) null));
        }
    }
}
@@ -1,167 +0,0 @@
package org.ansj.library;

import org.ansj.dic.DicReader;
import org.ansj.domain.AnsjItem;
import org.ansj.domain.PersonNatureAttr;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.name.PersonAttrLibrary;
import org.nlpcn.commons.lang.dat.DoubleArrayTire;
import org.nlpcn.commons.lang.dat.Item;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;

public class DATDictionary {

    private static final Log LOG = LogFactory.getLog(DATDictionary.class);

    /**
     * Core dictionary.
     */
    private static final DoubleArrayTire DAT = loadDAT();

    /**
     * Array length.
     */
    public static int arrayLength = DAT.arrayLength;

    /**
     * Load the dictionary.
     *
     * @return
     */
    private static DoubleArrayTire loadDAT() {
        long start = System.currentTimeMillis();
        try {
            DoubleArrayTire dat = DoubleArrayTire.loadText(DicReader.getInputStream("core.dic"), AnsjItem.class);
            // required for person-name recognition
            personNameFull(dat);
            // record the words in the dictionary and clear part of the data
            for (Item item : dat.getDAT()) {
                if (item == null || item.getName() == null) {
                    continue;
                }
                if (item.getStatus() < 2) {
                    item.setName(null);
                    continue;
                }
            }
            LOG.info("init core library ok use time : " + (System.currentTimeMillis() - start));
            return dat;
        } catch (InstantiationException e) {
            LOG.warn("无法实例化", e);
        } catch (IllegalAccessException e) {
            LOG.warn("非法访问", e);
        } catch (NumberFormatException e) {
            LOG.warn("数字格式异常", e);
        } catch (IOException e) {
            LOG.warn("IO异常", e);
        }

        return null;
    }

    private static void personNameFull(DoubleArrayTire dat) throws NumberFormatException, IOException {
        HashMap<String, PersonNatureAttr> personMap = new PersonAttrLibrary().getPersonMap();

        AnsjItem ansjItem = null;
        // supplement person-name natures
        Set<Entry<String, PersonNatureAttr>> entrySet = personMap.entrySet();
        char c = 0;
        String temp = null;
        for (Entry<String, PersonNatureAttr> entry : entrySet) {
            temp = entry.getKey();

            if (temp.length() == 1 && (ansjItem = (AnsjItem) dat.getDAT()[temp.charAt(0)]) == null) {
                ansjItem = new AnsjItem();
                ansjItem.setBase(c);
                ansjItem.setCheck(-1);
                ansjItem.setStatus((byte) 3);
                ansjItem.setName(temp);
                dat.getDAT()[temp.charAt(0)] = ansjItem;
            } else {
                ansjItem = dat.getItem(temp);
            }

            if (ansjItem == null) {
                continue;
            }

            if ((ansjItem.termNatures) == null) {
                if (temp.length() == 1 && temp.charAt(0) < 256) {
                    ansjItem.termNatures = TermNatures.NULL;
                } else {
                    ansjItem.termNatures = new TermNatures(TermNature.NR);
                }
            }
            ansjItem.termNatures.setPersonNatureAttr(entry.getValue());
        }
    }

    public static int status(char c) {
        Item item = DAT.getDAT()[c];
        if (item == null) {
            return 0;
        }
        return item.getStatus();
    }

    /**
     * Check whether a word is in the dictionary.
     *
     * @param word
     * @return
     */
    public static boolean isInSystemDic(String word) {
        Item item = DAT.getItem(word);
        return item != null && item.getStatus() > 1;
    }

    public static AnsjItem getItem(int index) {
        AnsjItem item = DAT.getItem(index);
        if (item == null) {
            return AnsjItem.NULL;
        }

        return item;
    }

    public static AnsjItem getItem(String str) {
        AnsjItem item = DAT.getItem(str);
        if (item == null || item.getStatus() < 2) {
            return AnsjItem.NULL;
        }

        return item;
    }

    public static int getId(String str) {
        return DAT.getId(str);
    }

}
@@ -1,309 +0,0 @@
package org.ansj.library;

import org.ansj.dic.PathToStream;
import org.ansj.domain.KV;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.Value;
import org.nlpcn.commons.lang.tire.library.Library;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

import java.io.BufferedReader;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

public class DicLibrary {

    private static final Log LOG = LogFactory.getLog();

    public static final String DEFAULT = "dic";

    public static final String DEFAULT_NATURE = "userDefine";

    public static final Integer DEFAULT_FREQ = 1000;

    public static final String DEFAULT_FREQ_STR = "1000";

    // user-defined dictionaries
    private static final Map<String, KV<String, Forest>> DIC = new HashMap<>();

    static {
        for (Entry<String, String> entry : MyStaticValue.ENV.entrySet()) {
            if (entry.getKey().startsWith(DEFAULT)) {
                put(entry.getKey(), entry.getValue());
            }
        }
        putIfAbsent(DEFAULT, "library/default.dic");

        Forest forest = get();
        if (forest == null) {
            put(DEFAULT, DEFAULT, new Forest());
        }

    }

    /**
     * Add a keyword.
     *
     * @param keyword the keyword to add
     * @param nature  the keyword's nature (part of speech)
     * @param freq    the keyword's frequency
     */
    public static void insert(String key, String keyword, String nature, int freq) {
        Forest dic = get(key);
        String[] paramers = new String[2];
        paramers[0] = nature;
        paramers[1] = String.valueOf(freq);
        Value value = new Value(keyword, paramers);
        Library.insertWord(dic, value);
    }

    /**
     * Add a keyword with the default nature and frequency.
     *
     * @param keyword
     */
    public static void insert(String key, String keyword) {

        insert(key, keyword, DEFAULT_NATURE, DEFAULT_FREQ);
    }

    /**
     * Remove a keyword.
     */
    public static void delete(String key, String word) {

        Forest dic = get(key);
        if (dic != null) {
            Library.removeWord(dic, word);
        }
    }

    /**
     * Clear the user-defined dictionary.
     */
    public static void clear(String key) {
        get(key).clear();
    }

    public static Forest get() {
        if (!DIC.containsKey(DEFAULT)) {
            return null;
        }
        return get(DEFAULT);
    }

    /**
     * Get the dictionary forest for the given key.
     *
     * @param key
     * @return
     */
    public static Forest get(String key) {

        KV<String, Forest> kv = DIC.get(key);

        if (kv == null) {
            if (MyStaticValue.ENV.containsKey(key)) {
                putIfAbsent(key, MyStaticValue.ENV.get(key));
                return get(key);
            }
            LOG.warn("dic " + key + " not found in config ");
            return null;
        }
        Forest forest = kv.getV();
        if (forest == null) {
            forest = init(key, kv, false);
        }
        return forest;

    }

    /**
     * Get a set of dictionaries by keys.
     *
     * @param keys
     * @return
     */
    public static Forest[] gets(String... keys) {
        Forest[] forests = new Forest[keys.length];
        for (int i = 0; i < forests.length; i++) {
            forests[i] = get(keys[i]);
        }
        return forests;
    }

    /**
     * Get a set of dictionaries by keys.
     *
     * @param keys
     * @return
     */
    public static Forest[] gets(Collection<String> keys) {
        return gets(keys.toArray(new String[keys.size()]));
    }

    /**
     * Load a user-defined dictionary.
     *
     * @param key
     * @param path
     * @return
     */
    private synchronized static Forest init(String key, KV<String, Forest> kv, boolean reload) {
        Forest forest = kv.getV();
        if (forest != null) {
            if (reload) {
                forest.clear();
            } else {
                return forest;
            }
        } else {
            forest = new Forest();
        }
        try {

            LOG.debug("begin init dic !");
            long start = System.currentTimeMillis();
            String temp = null;
            String[] strs = null;
            Value value = null;
            try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "UTF-8")) {
                while ((temp = br.readLine()) != null) {
                    if (StringUtil.isNotBlank(temp)) {
                        temp = StringUtil.trim(temp);
                        strs = temp.split("\t");
                        strs[0] = strs[0].toLowerCase();
                        // skip the entry if it already exists in the core dictionary
                        if (MyStaticValue.isSkipUserDefine && DATDictionary.getId(strs[0]) > 0) {
                            continue;
                        }
                        if (strs.length != 3) {
                            value = new Value(strs[0], DEFAULT_NATURE, DEFAULT_FREQ_STR);
                        } else {
                            value = new Value(strs[0], strs[1], strs[2]);
                        }
                        Library.insertWord(forest, value);
                    }
                }
            }
            LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
            kv.setV(forest);
            return forest;
        } catch (Exception e) {
            LOG.error("Init dic library error :" + e.getMessage() + ", path: " + kv.getK());
            DIC.remove(key);
            return null;
        }
    }

    /**
     * Add a dictionary dynamically.
     *
     * @param dicDefault
     * @param dicDefault2
     * @param dic2
     */
    public static void put(String key, String path, Forest forest) {
        DIC.put(key, KV.with(path, forest));
        MyStaticValue.ENV.put(key, path);
    }

    /**
     * Add a dictionary dynamically.
     *
     * @param dicDefault
     * @param dicDefault2
     * @param dic2
     */
    public static void putIfAbsent(String key, String path) {

        if (!DIC.containsKey(key)) {
            DIC.put(key, KV.with(path, (Forest) null));
        }
    }

    /**
     * Add a dictionary dynamically.
     *
     * @param dicDefault
     * @param dicDefault2
     * @param dic2
     */
    public static void put(String key, String path) {
        put(key, path, null);
    }

    /**
     * Add a dictionary dynamically.
     *
     * @param <T>
     * @param <T>
     *
     * @param dicDefault
     * @param dicDefault2
     * @param dic2
     */
    public static synchronized Forest putIfAbsent(String key, String path, Forest forest) {

        KV<String, Forest> kv = DIC.get(key);
        if (kv != null && kv.getV() != null) {
            return kv.getV();
        }
        put(key, path, forest);
        return forest;
    }

    public static KV<String, Forest> remove(String key) {
        KV<String, Forest> kv = DIC.get(key);
        if (kv != null && kv.getV() != null) {
            kv.getV().clear();
        }
        MyStaticValue.ENV.remove(key);
        return DIC.remove(key);
    }

    public static Set<String> keys() {
        return DIC.keySet();
    }

    public static void reload(String key) {
        if (!MyStaticValue.ENV.containsKey(key)) { // if the key is not in the environment, just remove it
            remove(key);
        }

        putIfAbsent(key, MyStaticValue.ENV.get(key));

        KV<String, Forest> kv = DIC.get(key);

        init(key, kv, true);
    }

}
@@ -1,144 +0,0 @@
package org.ansj.library;

import org.ansj.domain.Nature;
import org.ansj.domain.Term;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;

public class NatureLibrary {

    private static final Log logger = LogFactory.getLog(NatureLibrary.class);

    private static final int YI = 1;
    private static final int FYI = -1;
    /**
     * HashMap from a nature string to its index position (I notice I am obsessing over efficiency again; I should stop).
     */
    private static final HashMap<String, Nature> NATUREMAP = new HashMap<>();

    /**
     * Relations between natures, aligned with natureARRAY and natureMap.
     */
    private static int[][] NATURETABLE = null;

    /**
     * Initialize the lookup tables.
     */
    static {
        init();
    }

    private static void init() {
        String split = "\t";
        int maxLength = 0;
        String temp = null;
        String[] strs = null;
        // load the nature lookup table
        try (BufferedReader reader = MyStaticValue.getNatureMapReader()) {
            int p0 = 0;
            int p1 = 0;
            int p2 = 0;
            while ((temp = reader.readLine()) != null) {
                strs = temp.split(split);
                if (strs.length != 4)
                    continue;

                p0 = Integer.parseInt(strs[0]);
                p1 = Integer.parseInt(strs[1]);
                p2 = Integer.parseInt(strs[3]);
                NATUREMAP.put(strs[2], new Nature(strs[2], p0, p1, p2));
                maxLength = Math.max(maxLength, p1);
            }
        } catch (IOException e) {
            logger.warn("词性列表加载失败!", e);
        }
        // load the nature relation table
        try (BufferedReader reader = MyStaticValue.getNatureTableReader()) {
            NATURETABLE = new int[maxLength + 1][maxLength + 1];
            int j = 0;
            while ((temp = reader.readLine()) != null) {
                if (StringUtil.isBlank(temp))
                    continue;
                strs = temp.split(split);
                for (int i = 0; i < strs.length; i++) {
                    NATURETABLE[j][i] = Integer.parseInt(strs[i]);
                }
                j++;
            }
        } catch (IOException e) {
            logger.warn("加载词性关系失败!", e);
        }
    }

    /**
     * Get the frequency between two natures.
     *
     * @param from
     * @param to
     * @return
     */
    public static int getTwoNatureFreq(Nature from, Nature to) {
        if (from.index < 0 || to.index < 0) {
            return 0;
        }
        return NATURETABLE[from.index][to.index];
    }

    /**
     * Get the frequency between two terms.
     *
     * @param fromTerm
     * @param toTerm
     * @return
     */
    public static int getTwoTermFreq(Term fromTerm, Term toTerm) {
        Nature from = fromTerm.natrue();
        Nature to = toTerm.natrue();
        if (from.index < 0 || to.index < 0) {
            return 0;
        }
        return NATURETABLE[from.index][to.index];
    }

    /**
     * Get the nature for a string; create one if it does not exist.
     *
     * @param natureStr
     * @return
     */
    public static Nature getNature(String natureStr) {
        Nature nature = NATUREMAP.get(natureStr);
        if (nature == null) {
            nature = new Nature(natureStr, FYI, FYI, YI);
            NATUREMAP.put(natureStr, nature);
            return nature;
        }
        return nature;
    }
}
@@ -1,59 +0,0 @@
package org.ansj.library;

import org.ansj.domain.Term;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.logging.LogFactory;

/**
 * Associations between two words.
 *
 * @author ansj
 *
 */
public class NgramLibrary {
    static {
        long start = System.currentTimeMillis();
        MyStaticValue.initBigramTables();
        LogFactory.getLog(NgramLibrary.class).info("init ngram ok use time :" + (System.currentTimeMillis() - start));
    }

    /**
     * Look up the frequency between two words.
     *
     * @param from
     * @param to
     * @return
     */
    public static int getTwoWordFreq(Term from, Term to) {
        if (from.item().bigramEntryMap == null) {
            return 0;
        }
        Integer freq = from.item().bigramEntryMap.get(to.item().getIndex());
        if (freq == null) {
            return 0;
        } else {
            return freq;
        }
    }

}
@@ -1,271 +0,0 @@
package org.ansj.library;
|
||||
|
||||
import org.ansj.dic.PathToStream;
|
||||
import org.ansj.domain.KV;
|
||||
import org.ansj.recognition.impl.StopRecognition;
|
||||
import org.ansj.util.MyStaticValue;
|
||||
import org.nlpcn.commons.lang.util.IOUtil;
|
||||
import org.nlpcn.commons.lang.util.StringUtil;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
|
||||
public class StopLibrary {
|
||||
|
||||
private static final Log LOG = LogFactory.getLog();
|
||||
|
||||
public static final String DEFAULT = "stop";
|
||||
|
||||
// 用户自定义词典
|
||||
private static final Map<String, KV<String, StopRecognition>> STOP = new HashMap<>();
|
||||
|
||||
static {
|
||||
for (Entry<String, String> entry : MyStaticValue.ENV.entrySet()) {
|
||||
if (entry.getKey().startsWith(DEFAULT)) {
|
||||
put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
putIfAbsent(DEFAULT, "library/stop.dic");
|
||||
}
|
||||
|
||||
/**
|
||||
* 词性过滤
|
||||
*
|
||||
* @param key
|
||||
* @param stopNatures
|
||||
*/
|
||||
public static void insertStopNatures(String key, String... filterNatures) {
|
||||
StopRecognition fr = get(key);
|
||||
fr.insertStopNatures(filterNatures);
|
||||
}
|
||||
|
||||
/**
|
||||
* 正则过滤
|
||||
*
|
||||
* @param key
|
||||
* @param regexes
|
||||
*/
|
||||
public static void insertStopRegexes(String key, String... regexes) {
|
||||
StopRecognition fr = get(key);
|
||||
fr.insertStopRegexes(regexes);
|
||||
}
|
||||
|
||||
/**
|
||||
* 增加停用词
|
||||
*
|
||||
* @param key
|
||||
* @param regexes
|
||||
*/
|
||||
public static void insertStopWords(String key, String... stopWords) {
|
||||
StopRecognition fr = get(key);
|
||||
fr.insertStopWords(stopWords);
|
||||
}
|
||||
|
||||
/**
|
||||
* 增加停用词
|
||||
*
|
||||
* @param key
|
||||
* @param regexes
|
||||
*/
|
||||
public static void insertStopWords(String key, List<String> stopWords) {
|
||||
StopRecognition fr = get(key);
|
||||
fr.insertStopWords(stopWords);
|
||||
}
|
||||
|
||||
public static StopRecognition get() {
|
||||
return get(DEFAULT);
|
||||
}
|
||||
|
||||
/**
|
||||
* 根据模型名称获取crf模型
|
||||
*
|
||||
* @param modelName
|
||||
* @return
|
||||
*/
|
||||
public static StopRecognition get(String key) {
|
||||
KV<String, StopRecognition> kv = STOP.get(key);
|
||||
|
||||
if (kv == null) {
|
||||
if (MyStaticValue.ENV.containsKey(key)) {
|
||||
putIfAbsent(key, MyStaticValue.ENV.get(key));
|
||||
return get(key);
|
||||
}
|
||||
LOG.warn("STOP " + key + " not found in config ");
|
||||
return null;
|
||||
}
|
||||
StopRecognition stopRecognition = kv.getV();
|
||||
if (stopRecognition == null) {
|
||||
stopRecognition = init(key, kv, false);
|
||||
}
|
||||
return stopRecognition;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 用户自定义词典加载
|
||||
*
|
||||
* @param key
|
||||
* @param path
|
||||
* @return
|
||||
*/
|
||||
private synchronized static StopRecognition init(String key, KV<String, StopRecognition> kv, boolean reload) {
|
||||
StopRecognition stopRecognition = kv.getV();
|
||||
|
||||
if (stopRecognition != null) {
|
||||
if (reload) {
|
||||
stopRecognition.clear();
|
||||
} else {
|
||||
return stopRecognition;
|
||||
}
|
||||
} else {
|
||||
stopRecognition = new StopRecognition();
|
||||
}
|
||||
|
||||
try {
|
||||
LOG.debug("begin init FILTER !");
|
||||
long start = System.currentTimeMillis();
|
||||
String temp = null;
|
||||
String[] strs = null;
|
||||
try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "UTF-8")) {
|
||||
while ((temp = br.readLine()) != null) {
|
||||
if (StringUtil.isNotBlank(temp)) {
|
||||
temp = StringUtil.trim(temp);
|
||||
strs = temp.split("\t");
|
||||
|
||||
if (strs.length == 1) {
|
||||
stopRecognition.insertStopWords(strs[0]);
|
||||
} else {
|
||||
switch (strs[1]) {
|
||||
case "nature":
|
||||
stopRecognition.insertStopNatures(strs[0]);
|
||||
break;
|
||||
case "regex":
|
||||
stopRecognition.insertStopRegexes(strs[0]);
|
||||
break;
|
||||
default:
|
||||
stopRecognition.insertStopWords(strs[0]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG.info("load stop use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
|
||||
kv.setV(stopRecognition);
|
||||
return stopRecognition;
|
||||
} catch (Exception e) {
|
||||
LOG.error("Init Stop library error :" + e.getMessage() + ", path: " + kv.getK());
|
||||
STOP.remove(key);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 动态添加词典
|
||||
*
|
||||
* @param FILTERDefault
|
||||
* @param FILTERDefault2
|
||||
* @param FILTER2
|
||||
*/
|
||||
public static void put(String key, String path, StopRecognition stopRecognition) {
|
||||
STOP.put(key, KV.with(path, stopRecognition));
|
||||
MyStaticValue.ENV.put(key, path);
|
||||
}
|
||||
|
||||
/**
|
||||
* 动态添加词典
|
||||
*
|
||||
* @param FILTERDefault
|
||||
* @param FILTERDefault2
|
||||
* @param FILTER2
|
||||
*/
|
||||
public static void putIfAbsent(String key, String path) {
|
||||
if (!STOP.containsKey(key)) {
|
||||
STOP.put(key, KV.with(path, (StopRecognition) null));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Registers a stop-word dictionary at runtime, to be loaded lazily on first use.
 *
 * @param key library key
 * @param path dictionary path
 */
|
||||
public static void put(String key, String path) {
|
||||
put(key, path, null);
|
||||
}
|
||||
|
||||
/**
 * Registers a stop-word dictionary only if no filter is already loaded for the key.
 *
 * @param key library key
 * @param path dictionary path
 * @param stopRecognition filter to register
 * @return the existing filter if one was already loaded, otherwise the supplied one
 */
|
||||
public static synchronized StopRecognition putIfAbsent(String key, String path, StopRecognition stopRecognition) {
|
||||
KV<String, StopRecognition> kv = STOP.get(key);
|
||||
if (kv != null && kv.getV() != null) {
|
||||
return kv.getV();
|
||||
}
|
||||
put(key, path, stopRecognition);
|
||||
return stopRecognition;
|
||||
}
|
||||
|
||||
public static KV<String, StopRecognition> remove(String key) {
|
||||
KV<String, StopRecognition> kv = STOP.get(key);
|
||||
if (kv != null && kv.getV() != null) {
|
||||
kv.getV().clear();
|
||||
}
|
||||
MyStaticValue.ENV.remove(key);
|
||||
return STOP.remove(key);
|
||||
}
|
||||
|
||||
public static Set<String> keys() {
|
||||
return STOP.keySet();
|
||||
}
|
||||
|
||||
public static void reload(String key) {
|
||||
|
||||
if (!MyStaticValue.ENV.containsKey(key)) { // if the key is no longer configured, simply remove it
|
||||
remove(key);
|
||||
}
|
||||
|
||||
putIfAbsent(key, MyStaticValue.ENV.get(key));
|
||||
|
||||
KV<String, StopRecognition> kv = STOP.get(key);
|
||||
|
||||
init(key, kv, true);
|
||||
}
|
||||
|
||||
}
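
For context on what is being removed: the stop-word filter loaded by StopLibrary was typically applied to a parse result roughly as below. This is a minimal sketch, assuming the usual ansj 5.x API (StopRecognition in org.ansj.recognition.impl, Result.recognition(...) returning the result for chaining); the sample text and stop entries are illustrative only.

import org.ansj.domain.Result;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.splitWord.analysis.ToAnalysis;

public class StopFilterSketch {
    public static void main(String[] args) {
        StopRecognition filter = new StopRecognition();
        filter.insertStopWords("的");      // drop an exact word
        filter.insertStopNatures("w");     // drop terms by nature (punctuation)
        filter.insertStopRegexes("\\d+");  // drop terms matching a regex

        // segment, then strip the stop entries from the result
        Result result = ToAnalysis.parse("这是一个测试的句子, 编号 123").recognition(filter);
        System.out.println(result);
    }
}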
|
|
@ -1,312 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.library;
|
||||
|
||||
import org.ansj.dic.PathToStream;
|
||||
import org.ansj.domain.KV;
|
||||
import org.ansj.util.MyStaticValue;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
import org.nlpcn.commons.lang.util.IOUtil;
|
||||
import org.nlpcn.commons.lang.util.StringUtil;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.util.*;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
public class SynonymsLibrary {
|
||||
|
||||
private static final Log LOG = MyStaticValue.getLog(SynonymsLibrary.class);
|
||||
|
||||
// synonym dictionary table, keyed by library name
|
||||
private static final Map<String, KV<String, SmartForest<List<String>>>> SYNONYMS = new HashMap<>();
|
||||
|
||||
public static final String DEFAULT = "synonyms";
|
||||
|
||||
static {
|
||||
for (Entry<String, String> entry : MyStaticValue.ENV.entrySet()) {
|
||||
if (entry.getKey().startsWith(DEFAULT)) {
|
||||
put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
putIfAbsent(DEFAULT, "library/synonyms.dic");
|
||||
}
|
||||
|
||||
public static SmartForest<List<String>> get() {
|
||||
return get(DEFAULT);
|
||||
}
|
||||
|
||||
/**
 * Returns the synonym forest registered under the given key, loading it lazily if needed.
 */
|
||||
public static SmartForest<List<String>> get(String key) {
|
||||
KV<String, SmartForest<List<String>>> kv = SYNONYMS.get(key);
|
||||
|
||||
if (kv == null) {
|
||||
if (MyStaticValue.ENV.containsKey(key)) {
|
||||
putIfAbsent(key, MyStaticValue.ENV.get(key));
|
||||
return get(key);
|
||||
}
|
||||
LOG.warn("crf " + key + " not found in config ");
|
||||
return null;
|
||||
}
|
||||
|
||||
SmartForest<List<String>> sw = kv.getV();
|
||||
if (sw == null) {
|
||||
sw = init(key, kv, false);
|
||||
}
|
||||
return sw;
|
||||
}
|
||||
|
||||
/**
 * Loads (or reloads) a synonym dictionary from its configured path.
 *
 * @param key library key
 * @param kv pair holding the dictionary path and the loaded forest
 * @param reload whether to clear and reload an already loaded dictionary
 * @return the loaded synonym forest, or null if loading failed
 */
|
||||
private static synchronized SmartForest<List<String>> init(String key, KV<String, SmartForest<List<String>>> kv,
|
||||
boolean reload) {
|
||||
|
||||
SmartForest<List<String>> forest = kv.getV();
|
||||
|
||||
if (forest != null) {
|
||||
if (reload) {
|
||||
forest.clear();
|
||||
} else {
|
||||
return forest;
|
||||
}
|
||||
} else {
|
||||
forest = new SmartForest<>();
|
||||
}
|
||||
|
||||
LOG.debug("begin init synonyms " + kv.getK());
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
try (BufferedReader reader = IOUtil.getReader(PathToStream.stream(kv.getK()), IOUtil.UTF8)) {
|
||||
String temp = null;
|
||||
while ((temp = reader.readLine()) != null) {
|
||||
if (StringUtil.isBlank(temp)) {
|
||||
continue;
|
||||
}
|
||||
String[] split = temp.split("\t");
|
||||
|
||||
List<String> list = new ArrayList<>();
|
||||
for (String word : split) {
|
||||
if (StringUtil.isBlank(word)) {
|
||||
continue;
|
||||
}
|
||||
list.add(word);
|
||||
}
|
||||
|
||||
if (split.length <= 1) {
|
||||
LOG.warn(temp + " in synonymsLibrary not in to library !");
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < split.length; i++) {
|
||||
forest.add(split[i], list);
|
||||
}
|
||||
}
|
||||
kv.setV(forest);
|
||||
LOG.info("load synonyms use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
|
||||
return forest;
|
||||
} catch (Exception e) {
|
||||
LOG.error("Init synonyms library error :" + e.getMessage() + ", path: " + kv.getK());
|
||||
SYNONYMS.remove(key);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Registers a synonym dictionary at runtime, to be loaded lazily on first use.
 *
 * @param key library key
 * @param path dictionary path
 */
|
||||
public static void put(String key, String path) {
|
||||
put(key, path, null);
|
||||
}
|
||||
|
||||
public static void put(String key, String path, SmartForest<List<String>> value) {
|
||||
SYNONYMS.put(key, KV.with(path, value));
|
||||
MyStaticValue.ENV.put(key, path);
|
||||
}
|
||||
|
||||
/**
 * Removes a synonym dictionary, clearing its contents first.
 *
 * @param key library key
 * @return the removed entry
 */
|
||||
public static KV<String, SmartForest<List<String>>> remove(String key) {
|
||||
KV<String, SmartForest<List<String>>> kv = SYNONYMS.get(key);
|
||||
if (kv != null && kv.getV() != null) { //先清空后删除
|
||||
kv.getV().clear();
|
||||
}
|
||||
MyStaticValue.ENV.remove(key);
|
||||
return SYNONYMS.remove(key);
|
||||
}
|
||||
|
||||
/**
 * Reloads a synonym dictionary from its configured path; if the key is no longer configured it is removed instead.
 *
 * @param key library key
 */
|
||||
public static void reload(String key) {
|
||||
|
||||
if (!MyStaticValue.ENV.containsKey(key)) { // if the key is no longer configured, simply remove it
|
||||
remove(key);
|
||||
}
|
||||
|
||||
putIfAbsent(key, MyStaticValue.ENV.get(key));
|
||||
|
||||
KV<String, SmartForest<List<String>>> kv = SYNONYMS.get(key);
|
||||
|
||||
init(key, kv, true);
|
||||
}
|
||||
|
||||
public static Set<String> keys() {
|
||||
return SYNONYMS.keySet();
|
||||
}
|
||||
|
||||
public static void putIfAbsent(String key, String path) {
|
||||
if (!SYNONYMS.containsKey(key)) {
|
||||
SYNONYMS.put(key, KV.with(path, (SmartForest<List<String>>) null));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Replaces a synonym group, e.g. [中国, 中华, 我国] -> insert([中国, 华夏]) -> [中国, 华夏].
 *
 * @param key library key
 * @param words new synonym group
 */
|
||||
public static void insert(String key, String[] words) {
|
||||
SmartForest<List<String>> synonyms = get(key);
|
||||
|
||||
List<String> list = new ArrayList<>();
|
||||
|
||||
for (String word : words) {
|
||||
if (StringUtil.isBlank(word)) {
|
||||
continue;
|
||||
}
|
||||
list.add(word);
|
||||
}
|
||||
|
||||
if (list.size() <= 1) {
|
||||
LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word");
|
||||
return;
|
||||
}
|
||||
|
||||
Set<String> set = findAllWords(key, words);
|
||||
|
||||
for (String word : list) {
|
||||
set.remove(word);
|
||||
synonyms.add(word, list);
|
||||
}
|
||||
|
||||
for (String word : set) { // remove all remaining old entries
|
||||
synonyms.remove(word);
|
||||
synonyms.getBranch(word).setParam(null);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static Set<String> findAllWords(String key, String[] words) {
|
||||
|
||||
SmartForest<List<String>> synonyms = get(key);
|
||||
|
||||
Set<String> set = new HashSet<>();
|
||||
for (String word : words) {
|
||||
SmartForest<List<String>> branch = synonyms.getBranch(word);
|
||||
if (branch != null) {
|
||||
List<String> params = branch.getParam();
|
||||
if (params != null) {
|
||||
set.addAll(params);
|
||||
}
|
||||
}
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
/**
 * Merges into an existing synonym group, e.g. [中国, 中华, 我国] -> append([中国, 华夏]) -> [中国, 中华, 我国, 华夏].
 *
 * @param key library key
 * @param words words to merge into the group
 */
|
||||
public static void append(String key, String[] words) {
|
||||
|
||||
SmartForest<List<String>> synonyms = get(key);
|
||||
|
||||
Set<String> set = new HashSet<>();
|
||||
|
||||
for (String word : words) {
|
||||
if (StringUtil.isBlank(word)) {
|
||||
continue;
|
||||
}
|
||||
set.add(word);
|
||||
}
|
||||
|
||||
if (set.size() <= 1) {
|
||||
LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word");
|
||||
return;
|
||||
}
|
||||
|
||||
set.addAll(findAllWords(key, words));
|
||||
|
||||
List<String> list = new ArrayList<>(set);
|
||||
|
||||
for (String word : list) {
|
||||
synonyms.addBranch(word, list);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Removes a single word from its synonym group, e.g. [中国, 中华, 我国] -> remove(我国) -> [中国, 中华].
 *
 * @param key library key
 * @param word word to remove
 */
|
||||
public static void remove(String key, String word) {
|
||||
|
||||
SmartForest<List<String>> synonyms = get(key);
|
||||
|
||||
SmartForest<List<String>> branch = synonyms.getBranch(word);
|
||||
|
||||
if (branch == null || branch.getStatus() < 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
List<String> params = branch.getParam();
|
||||
|
||||
synonyms.remove(word);
|
||||
branch.setParam(null);
|
||||
params.remove(word);
|
||||
|
||||
if (params.size() == 1) { // if only one word is left, remove it as well
|
||||
synonyms.remove(params.get(0));
|
||||
params.remove(0);
|
||||
} else {
|
||||
params.remove(word);
|
||||
}
|
||||
}
|
||||
}
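
To make the insert/append contract above concrete, a small sketch using only the SynonymsLibrary methods defined in this file (the words are the same illustrative ones used in the Javadoc):

// assume the default library currently holds the group [中国, 中华, 我国]
SynonymsLibrary.append(SynonymsLibrary.DEFAULT, new String[] {"中国", "华夏"});
// merge: the group becomes [中国, 中华, 我国, 华夏]

SynonymsLibrary.insert(SynonymsLibrary.DEFAULT, new String[] {"中国", "华夏"});
// replace: the group becomes [中国, 华夏], and words dropped from it lose their mapping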
|
|
@ -1,73 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.library.company;
|
||||
|
||||
import org.ansj.util.MyStaticValue;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
 * Loader for the organization-name recognition dictionary.
 *
 * @author ansj
 */
|
||||
public class CompanyAttrLibrary {
|
||||
|
||||
private static final Log logger = LogFactory.getLog();
|
||||
|
||||
private static HashMap<String, int[]> cnMap = null;
|
||||
|
||||
private CompanyAttrLibrary() {}
|
||||
|
||||
public static HashMap<String, int[]> getCompanyMap() {
|
||||
if (cnMap != null) {
|
||||
return cnMap;
|
||||
}
|
||||
init();
|
||||
return cnMap;
|
||||
}
|
||||
|
||||
// company_freq
|
||||
|
||||
private static void init() {
|
||||
try (BufferedReader br = MyStaticValue.getCompanReader()) {
|
||||
cnMap = new HashMap<>();
|
||||
String temp = null;
|
||||
String[] strs = null;
|
||||
int[] cna = null;
|
||||
while ((temp = br.readLine()) != null) {
|
||||
strs = temp.split("\t");
|
||||
cna = new int[2];
|
||||
cna[0] = Integer.parseInt(strs[1]);
|
||||
cna[1] = Integer.parseInt(strs[2]);
|
||||
cnMap.put(strs[0], cna);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.warn("IO异常", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,99 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.library.name;
|
||||
|
||||
import org.ansj.domain.PersonNatureAttr;
|
||||
import org.ansj.util.MyStaticValue;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
|
||||
/**
 * Dictionary used for person-name tagging. It is a plain HashMap and is only used while the dictionaries are loaded.
 *
 * @author ansj
 */
|
||||
|
||||
public class PersonAttrLibrary {
|
||||
|
||||
private static final Log logger = LogFactory.getLog();
|
||||
|
||||
private HashMap<String, PersonNatureAttr> pnMap = null;
|
||||
|
||||
public PersonAttrLibrary() {}
|
||||
|
||||
public HashMap<String, PersonNatureAttr> getPersonMap() {
|
||||
if (pnMap != null) {
|
||||
return pnMap;
|
||||
}
|
||||
init1();
|
||||
init2();
|
||||
return pnMap;
|
||||
}
|
||||
|
||||
// name_freq
|
||||
private void init2() {
|
||||
Map<String, int[][]> personFreqMap = MyStaticValue.getPersonFreqMap();
|
||||
Set<Entry<String, int[][]>> entrySet = personFreqMap.entrySet();
|
||||
PersonNatureAttr pna = null;
|
||||
for (Entry<String, int[][]> entry : entrySet) {
|
||||
pna = pnMap.get(entry.getKey());
|
||||
if (pna == null) {
|
||||
pna = new PersonNatureAttr();
|
||||
pna.setlocFreq(entry.getValue());
|
||||
pnMap.put(entry.getKey(), pna);
|
||||
} else {
|
||||
pna.setlocFreq(entry.getValue());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// person.dic
|
||||
private void init1() {
|
||||
try (BufferedReader br = MyStaticValue.getPersonReader()) {
|
||||
pnMap = new HashMap<>();
|
||||
String temp = null;
|
||||
String[] strs = null;
|
||||
PersonNatureAttr pna = null;
|
||||
while ((temp = br.readLine()) != null) {
|
||||
pna = new PersonNatureAttr();
|
||||
strs = temp.split("\t");
|
||||
pna = pnMap.get(strs[0]);
|
||||
if (pna == null) {
|
||||
pna = new PersonNatureAttr();
|
||||
}
|
||||
pna.addFreq(Integer.parseInt(strs[1]), Integer.parseInt(strs[2]));
|
||||
pnMap.put(strs[0], pna);
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("数字格式不正确", e);
|
||||
} catch (IOException e) {
|
||||
logger.warn("IO异常", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,35 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Recognition interface applied to a segmentation Result: rule-based post-processing of the terms.
 *
 * @author Ansj
 */
|
||||
public interface Recognition extends Serializable {
|
||||
public void recognition(Result result);
|
||||
}
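
For reference, an implementation of this interface is just a serializable hook over the parse Result; a minimal sketch (class name and filtering rule are made up for illustration):

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.Recognition;

import java.util.Iterator;

public class SingleCharDropRecognition implements Recognition {

    private static final long serialVersionUID = 1L;

    @Override
    public void recognition(Result result) {
        // post-process the result in place: remove every single-character term
        for (Iterator<Term> it = result.getTerms().iterator(); it.hasNext();) {
            if (it.next().getName().length() == 1) {
                it.remove();
            }
        }
    }
}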
|
|
@ -1,33 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition;
|
||||
|
||||
import org.ansj.domain.Term;
|
||||
|
||||
/**
 * Recognition interface applied to the raw term array during analysis.
 *
 * @author Ansj
 */
|
||||
public interface TermArrRecognition {
|
||||
public void recognition(Term[] terms);
|
||||
}
|
|
@ -1,197 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.arrimpl;
|
||||
|
||||
import org.ansj.domain.*;
|
||||
import org.ansj.library.NgramLibrary;
|
||||
import org.ansj.recognition.TermArrRecognition;
|
||||
import org.ansj.util.TermUtil;
|
||||
import org.ansj.util.TermUtil.InsertTermType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Chinese (Asian) person-name recognition.
 *
 * @author ansj
 */
|
||||
public class AsianPersonRecognition implements TermArrRecognition {
|
||||
private static final double[] FACTORY = {0.16271366224044456, 0.8060521860870434, 0.031234151672511947};
|
||||
private boolean skip = false;
|
||||
private Term[] terms;
|
||||
|
||||
// whether the name is ambiguous
// public int B = -1; // 0  surname
// public int C = -1; // 1  first character of a two-character given name
// public int D = -1; // 2  last character of a two-character given name
// public int E = -1; // 3  single-character given name
// public int N = -1; // 4  any character
// public int L = -1; // 11 context following a person name
// public int M = -1; // 12 component between two Chinese person names
// public int m = -1; // 44 a splittable name
// double[] factory = {"BC", "BCD", "BCDE"}
|
||||
|
||||
@Override
|
||||
public void recognition(Term[] terms) {
|
||||
this.terms = terms;
|
||||
List<Term> termList = recogntion_();
|
||||
for (Term term2 : termList) {
|
||||
TermUtil.insertTerm(terms, term2, InsertTermType.SCORE_ADD_SORT);
|
||||
}
|
||||
}
|
||||
|
||||
private List<Term> recogntion_() {
|
||||
Term term = null;
|
||||
Term tempTerm = null;
|
||||
List<Term> termList = new ArrayList<>();
|
||||
int beginFreq = 10;
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
term = terms[i];
|
||||
if (term == null || !term.termNatures().personAttr.flag) {
|
||||
continue;
|
||||
}
|
||||
term.score(0);
|
||||
term.selfScore(0);
|
||||
int freq = 0;
|
||||
for (int j = 2; j > -1; j--) {
|
||||
freq = term.termNatures().personAttr.getFreq(j, 0);
|
||||
if ((freq > 10) || (term.getName().length() == 2 && freq > 10)) {
|
||||
tempTerm = nameFind(i, beginFreq, j);
|
||||
if (tempTerm != null) {
|
||||
termList.add(tempTerm);
|
||||
// if the recognition is unambiguous
|
||||
if (skip) {
|
||||
for (int j2 = i; j2 < tempTerm.toValue(); j2++) {
|
||||
if (terms[j2] != null) {
|
||||
terms[j2].score(0);
|
||||
terms[j2].selfScore(0);
|
||||
}
|
||||
}
|
||||
i = tempTerm.toValue() - 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
beginFreq = term.termNatures().personAttr.begin + 1;
|
||||
}
|
||||
return termList;
|
||||
}
|
||||
|
||||
/**
 * Tries to assemble a person name starting at the given offset.
 *
 * @param offe start offset in the term array
 * @param beginFreq frequency of the preceding term acting as name context
 * @param size name-pattern index (the candidate spans size + 2 characters)
 */
|
||||
|
||||
private Term nameFind(int offe, int beginFreq, int size) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int undefinite = 0;
|
||||
skip = false;
|
||||
PersonNatureAttr pna = null;
|
||||
int index = 0;
|
||||
int freq = 0;
|
||||
double allFreq = 0;
|
||||
Term term = null;
|
||||
int i = offe;
|
||||
for (; i < terms.length; i++) {
|
||||
// walking to the end means a name has been recognized.
|
||||
if (terms[i] == null) {
|
||||
continue;
|
||||
}
|
||||
term = terms[i];
|
||||
pna = term.termNatures().personAttr;
|
||||
// frequency of this character at this position for this name length; if it is zero the candidate is impossible, so bail out
|
||||
if ((freq = pna.getFreq(size, index)) == 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (pna.allFreq > 0) {
|
||||
undefinite++;
|
||||
}
|
||||
sb.append(term.getName());
|
||||
allFreq += Math.log(term.termNatures().allFreq + 1);
|
||||
allFreq += -Math.log((freq));
|
||||
index++;
|
||||
|
||||
if (index == size + 2) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
double score = -Math.log(FACTORY[size]);
|
||||
score += allFreq;
|
||||
double endFreq = 0;
|
||||
// start looking for the term that ends the name
|
||||
boolean flag = true;
|
||||
while (flag) {
|
||||
i++;
|
||||
if (i >= terms.length) {
|
||||
endFreq = 10;
|
||||
flag = false;
|
||||
} else if (terms[i] != null) {
|
||||
int twoWordFreq = NgramLibrary.getTwoWordFreq(term, terms[i]);
|
||||
if (twoWordFreq > 3) {
|
||||
return null;
|
||||
}
|
||||
endFreq = terms[i].termNatures().personAttr.end + 1;
|
||||
flag = false;
|
||||
}
|
||||
}
|
||||
|
||||
score -= Math.log(endFreq);
|
||||
score -= Math.log(beginFreq);
|
||||
|
||||
if (score > -3) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (allFreq > 0 && undefinite > 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
skip = undefinite == 0;
|
||||
term = new Term(sb.toString(), offe, TermNatures.NR);
|
||||
term.selfScore(score);
|
||||
|
||||
return term;
|
||||
|
||||
}
|
||||
|
||||
public List<NewWord> getNewWords(Term[] terms) {
|
||||
this.terms = terms;
|
||||
List<NewWord> all = new ArrayList<>();
|
||||
List<Term> termList = recogntion_();
|
||||
for (Term term2 : termList) {
|
||||
all.add(new NewWord(term2.getName(), Nature.NR));
|
||||
}
|
||||
return all;
|
||||
}
|
||||
|
||||
public List<Term> getNewTerms() {
|
||||
return recogntion_();
|
||||
}
|
||||
|
||||
}
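
A note on the scorer above, derived only from the code in this file (not from upstream documentation): nameFind accumulates, over the size + 2 characters of a candidate,

score = -log(FACTORY[size]) + Σ (log(allFreq_i + 1) - log(freq_i)) - log(endFreq) - log(beginFreq)

and the candidate is kept only when score <= -3; it is additionally rejected when the accumulated frequency term is positive while at least one ambiguous character was seen (undefinite > 0). skip is then true exactly when no character was ambiguous, which lets recognition() zero out the scores of the covered terms and jump past them.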
|
|
@ -1,248 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.arrimpl;
|
||||
|
||||
import org.ansj.domain.Nature;
|
||||
import org.ansj.domain.NewWord;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.domain.TermNatures;
|
||||
import org.ansj.recognition.TermArrRecognition;
|
||||
import org.ansj.util.TermUtil;
|
||||
import org.nlpcn.commons.lang.util.StringUtil;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Foreign (transliterated) person-name recognition.
 *
 * @author ansj
 */
|
||||
public class ForeignPersonRecognition implements TermArrRecognition {
|
||||
|
||||
private static final LinkedList<NameChar> PRLIST = new LinkedList<>();
|
||||
|
||||
private static NameChar INNAME = null;
|
||||
|
||||
private static HashSet<Character> ISNOTFIRST = new HashSet<>();
|
||||
|
||||
static {
|
||||
NameChar trans_english = new NameChar(StringUtil.sortCharArray(
|
||||
"·-—阿埃艾爱安昂敖奥澳笆芭巴白拜班邦保堡鲍北贝本比毕彼别波玻博勃伯泊卜布才采仓查差柴彻川茨慈次达大戴代丹旦但当道德得登迪狄蒂帝丁东杜敦多额俄厄鄂恩尔伐法范菲芬费佛夫福弗甫噶盖干冈哥戈革葛格各根古瓜哈海罕翰汗汉豪合河赫亨侯呼胡华霍基吉及加贾坚简杰金京久居君喀卡凯坎康考柯科可克肯库奎拉喇莱来兰郎朗劳勒雷累楞黎理李里莉丽历利立力连廉良列烈林隆卢虏鲁路伦仑罗洛玛马买麦迈曼茅茂梅门蒙盟米蜜密敏明摩莫墨默姆木穆那娜纳乃奈南内尼年涅宁纽努诺欧帕潘畔庞培佩彭皮平泼普其契恰强乔切钦沁泉让热荣肉儒瑞若萨塞赛桑瑟森莎沙山善绍舍圣施诗石什史士守斯司丝苏素索塔泰坦汤唐陶特提汀图土吐托陀瓦万王旺威韦维魏温文翁沃乌吾武伍西锡希喜夏相香歇谢辛新牙雅亚彦尧叶依伊衣宜义因音英雍尤于约宰泽增詹珍治中仲朱诸卓孜祖佐伽娅尕腓滕济嘉津赖莲琳律略慕妮聂裴浦奇齐琴茹珊卫欣逊札哲智兹芙汶迦珀琪梵斐胥黛"));
|
||||
NameChar trans_russian = new NameChar(StringUtil.sortCharArray(
|
||||
"·-阿安奥巴比彼波布察茨大德得丁杜尔法夫伏甫盖格哈基加坚捷金卡科可克库拉莱兰勒雷里历利连列卢鲁罗洛马梅蒙米姆娜涅宁诺帕泼普奇齐乔切日萨色山申什斯索塔坦特托娃维文乌西希谢亚耶叶依伊以扎佐柴达登蒂戈果海赫华霍吉季津柯理琳玛曼穆纳尼契钦丘桑沙舍泰图瓦万雅卓兹"));
|
||||
// Japanese name characters are commented out: nominally a boycott, in practice they are not handled well.
|
||||
// NameChar trans_japanese = new NameChar(
|
||||
// StringUtil
|
||||
// .sortCharArray("安奥八白百邦保北倍本比滨博步部彩菜仓昌长朝池赤川船淳次村大代岛稻道德地典渡尔繁饭风福冈高工宫古谷关广桂贵好浩和合河黑横恒宏后户荒绘吉纪佳加见健江介金今进井静敬靖久酒菊俊康可克口梨理里礼栗丽利立凉良林玲铃柳隆鹿麻玛美萌弥敏木纳南男内鸟宁朋片平崎齐千前浅桥琴青清庆秋丘曲泉仁忍日荣若三森纱杉山善上伸神圣石实矢世市室水顺司松泰桃藤天田土万望尾未文武五舞西细夏宪相小孝新星行雄秀雅亚岩杨洋阳遥野也叶一伊衣逸义益樱永由有佑宇羽郁渊元垣原远月悦早造则泽增扎宅章昭沼真政枝知之植智治中忠仲竹助椎子佐阪坂堀荻菅薰浜濑鸠筱"));
|
||||
PRLIST.add(trans_english);
|
||||
PRLIST.add(trans_russian);
|
||||
// PRLIST.add(trans_japanese);
|
||||
|
||||
INNAME = new NameChar(StringUtil.sortCharArray(
|
||||
"-·—丁万丘东丝中丹丽乃久义乌乔买于亚亨京什仑仓代以仲伊伍伏伐伦伯伽但佐佛佩依侯俄保儒克兰其兹内冈凯切列利别力加努劳勃勒北华卓南博卜卡卢卫厄历及古可史叶司各合吉吐君吾呼哈哥哲唐喀善喇喜嘉噶因图土圣坎坚坦埃培基堡塔塞增墨士夏多大夫奇奈奎契奥妮姆威娃娅娜孜季宁守安宜宰密察尔尕尤尧尼居山川差巴布希帕帝干平年库庞康廉弗强当彦彭彻彼律得德恩恰慈慕戈戴才扎托拉拜捷提摩敏敖敦文斐斯新施日旦旺昂明普智曼朗木本札朱李杜来杰林果查柯柴根格桑梅梵森楞次欣欧歇武比毕汀汉汗汤汶沁沃沙河治泉泊法波泰泼泽洛津济浦海涅温滕潘澳烈热爱牙特狄王玛玻珀珊珍班理琪琳琴瑞瑟瓜瓦甫申畔略登白皮盖盟相石祖福科穆立笆简米素索累约纳纽绍维罕罗翁翰考耶聂肉肯胡胥腓舍良色艾芙芬芭苏若英茂范茅茨茹荣莉莎莫莱莲菲萨葛蒂蒙虏蜜衣裴西詹让诗诸诺谢豪贝费贾赖赛赫路辛达迈连迦迪逊道那邦郎鄂采里金钦锡门阿陀陶隆雅雍雷霍革韦音额香马魏鲁鲍麦黎默黛齐"));
|
||||
|
||||
ISNOTFIRST.add('-');
|
||||
ISNOTFIRST.add('·');
|
||||
ISNOTFIRST.add('—');
|
||||
}
|
||||
|
||||
private List<Term> tempList = new ArrayList<>();
|
||||
private LinkedList<NameChar> prList = null;
|
||||
private Term[] terms = null;
|
||||
|
||||
@Override
|
||||
public void recognition(Term[] terms) {
|
||||
this.terms = terms;
|
||||
String name = null;
|
||||
Term term = null;
|
||||
reset();
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
if (terms[i] == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
term = terms[i];
|
||||
// if the candidate starts on a person-name prefix or suffix character, skip it
|
||||
if (tempList.isEmpty()) {
|
||||
if (term.termNatures().personAttr.end > 10) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((terms[i].getName().length() == 1 && ISNOTFIRST.contains(terms[i].getName().charAt(0)))) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
name = term.getName();
|
||||
|
||||
if (term.termNatures() == TermNatures.NR || term.termNatures() == TermNatures.NW || name.length() == 1) {
|
||||
boolean flag = validate(name);
|
||||
if (flag) {
|
||||
tempList.add(term);
|
||||
}
|
||||
} else if (tempList.size() == 1) {
|
||||
reset();
|
||||
} else if (tempList.size() > 1) {
|
||||
TermUtil.insertTerm(terms, tempList, TermNatures.NR);
|
||||
reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean validate(String name) {
|
||||
boolean flag = false;
|
||||
NameChar nameChar = null;
|
||||
for (int j = 0; j < prList.size(); j++) {
|
||||
nameChar = prList.get(j);
|
||||
if (nameChar.contains(name)) {
|
||||
flag = true;
|
||||
} else {
|
||||
prList.remove(j);
|
||||
// step back one index after the removal
|
||||
j--;
|
||||
}
|
||||
}
|
||||
return flag;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private void reset() {
|
||||
|
||||
tempList.clear();
|
||||
prList = (LinkedList<NameChar>) PRLIST.clone();
|
||||
}
|
||||
|
||||
public static boolean isFName(String name) {
|
||||
for (int i = 0; i < name.length(); i++) {
|
||||
if (!INNAME.contains(name.charAt(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private static class NameChar {
|
||||
private char[] chars = null;
|
||||
|
||||
public NameChar(char[] chars) {
|
||||
this.chars = chars;
|
||||
}
|
||||
|
||||
public boolean contains(String name) {
|
||||
return contains(name.charAt(0));
|
||||
}
|
||||
|
||||
public boolean contains(char c) {
|
||||
return Arrays.binarySearch(chars, c) > -1;
|
||||
}
|
||||
}
|
||||
|
||||
public List<NewWord> getNewWords(Term[] terms) {
|
||||
this.terms = terms;
|
||||
List<NewWord> all = new ArrayList<>();
|
||||
String name = null;
|
||||
Term term = null;
|
||||
reset();
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
if (terms[i] == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
term = terms[i];
|
||||
// if the candidate starts on a person-name prefix or suffix character, skip it
|
||||
if (tempList.isEmpty()) {
|
||||
if (term.termNatures().personAttr.end > 10) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((terms[i].getName().length() == 1 && ISNOTFIRST.contains(terms[i].getName().charAt(0)))) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
name = term.getName();
|
||||
if (term.termNatures() == TermNatures.NR || term.termNatures() == TermNatures.NW || name.length() == 1) {
|
||||
boolean flag = validate(name);
|
||||
if (flag) {
|
||||
tempList.add(term);
|
||||
}
|
||||
} else if (tempList.size() == 1) {
|
||||
reset();
|
||||
} else if (tempList.size() > 1) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (Term temp : tempList) {
|
||||
sb.append(temp.getName());
|
||||
}
|
||||
all.add(new NewWord(sb.toString(), Nature.NRF));
|
||||
reset();
|
||||
}
|
||||
}
|
||||
return all;
|
||||
}
|
||||
|
||||
public List<Term> getNewTerms() {
|
||||
LinkedList<Term> result = new LinkedList<>();
|
||||
String name = null;
|
||||
Term term = null;
|
||||
reset();
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
if (terms[i] == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
term = terms[i];
|
||||
// if the candidate starts on a person-name prefix or suffix character, skip it
|
||||
if (tempList.isEmpty()) {
|
||||
if (term.termNatures().personAttr.end > 10) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((terms[i].getName().length() == 1 && ISNOTFIRST.contains(terms[i].getName().charAt(0)))) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
name = term.getName();
|
||||
|
||||
if (term.termNatures() == TermNatures.NR || term.termNatures() == TermNatures.NW || name.length() == 1) {
|
||||
boolean flag = validate(name);
|
||||
if (flag) {
|
||||
tempList.add(term);
|
||||
}
|
||||
} else if (tempList.size() == 1) {
|
||||
reset();
|
||||
} else if (tempList.size() > 1) {
|
||||
result.add(makeNewTerm());
|
||||
reset();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public Term makeNewTerm() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int offe = tempList.get(0).getOffe();
|
||||
for (Term term : tempList) {
|
||||
sb.append(term.getName());
|
||||
}
|
||||
return new Term(sb.toString(), offe, TermNatures.NR);
|
||||
}
|
||||
}
|
|
@ -1,158 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.arrimpl;
|
||||
|
||||
import org.ansj.dic.LearnTool;
|
||||
import org.ansj.domain.Nature;
|
||||
import org.ansj.domain.NewWord;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.util.TermUtil;
|
||||
import org.ansj.util.TermUtil.InsertTermType;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
|
||||
/**
 * New-word recognition based on the words accumulated by LearnTool.
 *
 * @author ansj
 */
|
||||
public class NewWordRecognition {
|
||||
|
||||
private Term[] terms = null;
|
||||
|
||||
private double score;
|
||||
|
||||
private StringBuilder sb = new StringBuilder();
|
||||
|
||||
private SmartForest<NewWord> forest = null;
|
||||
|
||||
private SmartForest<NewWord> branch = null;
|
||||
|
||||
// private int offe = -1;
|
||||
// private int endOffe = -1;
|
||||
private Nature tempNature;
|
||||
|
||||
private Term from;
|
||||
|
||||
private Term to;
|
||||
|
||||
// offset of the current candidate in the term array
|
||||
private int offe;
|
||||
|
||||
public NewWordRecognition(LearnTool learn) {
|
||||
forest = learn.getForest();
|
||||
branch = learn.getForest();
|
||||
}
|
||||
|
||||
public void recognition(Term[] terms) {
|
||||
this.terms = terms;
|
||||
if (branch == null) {
|
||||
return;
|
||||
}
|
||||
int length = terms.length - 1;
|
||||
|
||||
Term term = null;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (terms[i] == null) {
|
||||
continue;
|
||||
} else {
|
||||
from = terms[i].from();
|
||||
terms[i].score(0);
|
||||
terms[i].selfScore(0);
|
||||
}
|
||||
|
||||
branch = branch.getBranch(terms[i].getName());
|
||||
|
||||
if (branch == null || branch.getStatus() == 3) {
|
||||
reset();
|
||||
continue;
|
||||
}
|
||||
|
||||
offe = i;
|
||||
|
||||
// keep looking up and appending characters of the candidate word
|
||||
term = terms[i];
|
||||
sb.append(term.getName());
|
||||
if (branch.getStatus() == 2) {
|
||||
term.selfScore(branch.getParam().getScore());
|
||||
}
|
||||
boolean flag = true;
|
||||
while (flag) {
|
||||
term = term.to();
|
||||
branch = branch.getBranch(term.getName());
|
||||
// no further match: stop extending
|
||||
if (branch == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
switch (branch.getStatus()) {
|
||||
case 1:
|
||||
sb.append(term.getName());
|
||||
continue;
|
||||
case 2:
|
||||
sb.append(term.getName());
|
||||
score = branch.getParam().getScore();
|
||||
tempNature = branch.getParam().getNature();
|
||||
to = term.to();
|
||||
makeNewTerm();
|
||||
continue;
|
||||
case 3:
|
||||
sb.append(term.getName());
|
||||
score = branch.getParam().getScore();
|
||||
tempNature = branch.getParam().getNature();
|
||||
to = term.to();
|
||||
makeNewTerm();
|
||||
flag = false;
|
||||
break;
|
||||
default:
|
||||
System.out.println("怎么能出现0呢?");
|
||||
break;
|
||||
}
|
||||
}
|
||||
reset();
|
||||
}
|
||||
}
|
||||
|
||||
private void makeNewTerm() {
|
||||
Term term = new Term(sb.toString(), offe, tempNature.natureStr, 1);
|
||||
term.selfScore(score);
|
||||
term.setNature(tempNature);
|
||||
if (sb.length() > 3) {
|
||||
term.setSubTerm(TermUtil.getSubTerm(from, to));
|
||||
}
|
||||
TermUtil.termLink(from, term);
|
||||
TermUtil.termLink(term, to);
|
||||
TermUtil.insertTerm(terms, term, InsertTermType.SCORE_ADD_SORT);
|
||||
TermUtil.parseNature(term);
|
||||
}
|
||||
|
||||
/**
 * Resets the matching state.
 */
|
||||
private void reset() {
|
||||
offe = -1;
|
||||
tempNature = null;
|
||||
branch = forest;
|
||||
score = 0;
|
||||
sb = new StringBuilder();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,84 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.arrimpl;
|
||||
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.recognition.TermArrRecognition;
|
||||
import org.ansj.util.MyStaticValue;
|
||||
import org.ansj.util.TermUtil;
|
||||
|
||||
public class NumRecognition implements TermArrRecognition {
|
||||
|
||||
/**
 * Merges adjacent numeric terms into a single number term.
 *
 * @param terms term array, rewritten in place
 */
|
||||
@Override
|
||||
public void recognition(Term[] terms) {
|
||||
int length = terms.length - 1;
|
||||
Term from = null;
|
||||
Term to = null;
|
||||
Term temp = null;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (terms[i] == null) {
|
||||
continue;
|
||||
} else if (".".equals(terms[i].getName()) || ".".equals(terms[i].getName())) {
|
||||
// 如果是.前后都为数字进行特殊处理
|
||||
to = terms[i].to();
|
||||
from = terms[i].from();
|
||||
if (from.termNatures().numAttr.flag && to.termNatures().numAttr.flag) {
|
||||
from.setName(from.getName() + "." + to.getName());
|
||||
TermUtil.termLink(from, to.to());
|
||||
terms[to.getOffe()] = null;
|
||||
terms[i] = null;
|
||||
i = from.getOffe() - 1;
|
||||
}
|
||||
continue;
|
||||
} else if (!terms[i].termNatures().numAttr.flag) {
|
||||
continue;
|
||||
}
|
||||
|
||||
temp = terms[i];
|
||||
// merge all following numeric terms
|
||||
while ((temp = temp.to()).termNatures().numAttr.flag) {
|
||||
terms[i].setName(terms[i].getName() + temp.getName());
|
||||
}
|
||||
// if the following term commonly ends a number (a unit or quantifier), absorb it too
|
||||
if (MyStaticValue.isQuantifierRecognition && temp.termNatures().numAttr.numEndFreq > 0) {
|
||||
terms[i].setName(terms[i].getName() + temp.getName());
|
||||
temp = temp.to();
|
||||
}
|
||||
|
||||
// if they differ, terms[i] was extended
|
||||
if (terms[i].to() != temp) {
|
||||
TermUtil.termLink(terms[i], temp);
|
||||
// null out the merged-away terms in between
|
||||
for (int j = i + 1; j < temp.getOffe(); j++) {
|
||||
terms[j] = null;
|
||||
}
|
||||
i = temp.getOffe() - 1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
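
Concretely, this pass rewrites the term array in place; an illustrative before/after (not taken from a test in this diff):

// before: ["价格", "是", "1", "2", ".", "5", "元"]
// after : ["价格", "是", "12.5", "元"]
// with quantifier recognition enabled and a numeric-suffix term following,
// the suffix (e.g. a measure word) is merged into the number as well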
|
|
@ -1,185 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.arrimpl;
|
||||
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.domain.TermNature;
|
||||
import org.ansj.domain.TermNatures;
|
||||
import org.ansj.library.DicLibrary;
|
||||
import org.ansj.recognition.TermArrRecognition;
|
||||
import org.ansj.util.TermUtil;
|
||||
import org.ansj.util.TermUtil.InsertTermType;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
/**
 * Recognition based on user-defined (supplementary) dictionaries.
 *
 * @author ansj
 */
|
||||
public class UserDefineRecognition implements TermArrRecognition {
|
||||
|
||||
public static final Log logger = LogFactory.getLog(UserDefineRecognition.class);
|
||||
|
||||
private Term[] terms = null;
|
||||
|
||||
private Forest[] forests = {DicLibrary.get()};
|
||||
|
||||
private int offe = -1;
|
||||
private int endOffe = -1;
|
||||
private int tempFreq = 50;
|
||||
private String tempNature;
|
||||
|
||||
private SmartForest<String[]> branch = null;
|
||||
private SmartForest<String[]> forest = null;
|
||||
|
||||
private InsertTermType type = InsertTermType.SKIP;
|
||||
|
||||
public UserDefineRecognition(InsertTermType type, Forest... forests) {
|
||||
this.type = type;
|
||||
if (forests != null && forests.length > 0) {
|
||||
this.forests = forests;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void recognition(Term[] terms) {
|
||||
this.terms = terms;
|
||||
for (Forest forest : forests) {
|
||||
if (forest == null) {
|
||||
continue;
|
||||
}
|
||||
reset();
|
||||
this.forest = forest;
|
||||
|
||||
branch = forest;
|
||||
|
||||
int length = terms.length - 1;
|
||||
|
||||
boolean flag = true;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (terms[i] == null)
|
||||
continue;
|
||||
if (branch == forest) {
|
||||
flag = false;
|
||||
} else {
|
||||
flag = true;
|
||||
}
|
||||
|
||||
branch = termStatus(branch, terms[i]);
|
||||
if (branch == null) {
|
||||
if (offe != -1) {
|
||||
i = offe;
|
||||
}
|
||||
reset();
|
||||
} else if (branch.getStatus() == 3) {
|
||||
endOffe = i;
|
||||
tempNature = branch.getParam()[0];
|
||||
tempFreq = getInt(branch.getParam()[1], 50);
|
||||
if (offe != -1 && offe < endOffe) {
|
||||
i = offe;
|
||||
makeNewTerm();
|
||||
reset();
|
||||
} else {
|
||||
reset();
|
||||
}
|
||||
} else if (branch.getStatus() == 2) {
|
||||
endOffe = i;
|
||||
if (offe == -1) {
|
||||
offe = i;
|
||||
} else {
|
||||
tempNature = branch.getParam()[0];
|
||||
tempFreq = getInt(branch.getParam()[1], 50);
|
||||
if (flag) {
|
||||
makeNewTerm();
|
||||
}
|
||||
}
|
||||
} else if (branch.getStatus() == 1) {
|
||||
if (offe == -1) {
|
||||
offe = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (offe != -1 && offe < endOffe) {
|
||||
makeNewTerm();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private int getInt(String str, int def) {
|
||||
try {
|
||||
return Integer.parseInt(str);
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn(str + "不是一个数字", e);
|
||||
return def;
|
||||
}
|
||||
}
|
||||
|
||||
private void makeNewTerm() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int j = offe; j <= endOffe; j++) {
|
||||
if (terms[j] == null) {
|
||||
continue;
|
||||
} else {
|
||||
sb.append(terms[j].getName());
|
||||
}
|
||||
}
|
||||
TermNatures termNatures = new TermNatures(new TermNature(tempNature, tempFreq));
|
||||
Term term = new Term(sb.toString(), offe, termNatures);
|
||||
term.selfScore(-1 * tempFreq);
|
||||
TermUtil.insertTerm(terms, term, type);
|
||||
}
|
||||
|
||||
/**
 * Resets the matching state.
 */
|
||||
private void reset() {
|
||||
offe = -1;
|
||||
endOffe = -1;
|
||||
tempFreq = 50;
|
||||
tempNature = null;
|
||||
branch = forest;
|
||||
}
|
||||
|
||||
/**
 * Walks the characters of the given term through the trie and returns the node it ends on.
 *
 * @param branch current trie node
 * @param term term to look up
 * @return the node reached, or null if the term is not in the trie
 */
|
||||
private SmartForest<String[]> termStatus(SmartForest<String[]> branch, Term term) {
|
||||
String name = term.getName();
|
||||
SmartForest<String[]> sf = branch;
|
||||
for (int j = 0; j < name.length(); j++) {
|
||||
sf = sf.get(name.charAt(j));
|
||||
if (sf == null) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return sf;
|
||||
}
|
||||
|
||||
}
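
As a usage reminder for the removed user-dictionary path, a small sketch; DicLibrary.insert(key, word, nature, freq) is assumed from the ansj 5.x API (only get/gets/DEFAULT appear in this diff), and the word, nature and frequency are made up:

import org.ansj.domain.Result;
import org.ansj.library.DicLibrary;
import org.ansj.splitWord.analysis.ToAnalysis;

public class UserDictSketch {
    public static void main(String[] args) {
        // register a custom word in the default user dictionary
        DicLibrary.insert(DicLibrary.DEFAULT, "深度学习", "userDefine", 1000);

        // ToAnalysis consults the user dictionary (via UserDefineRecognition) while segmenting
        Result result = ToAnalysis.parse("我在学习深度学习");
        System.out.println(result);
    }
}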
|
|
@ -1,98 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.impl;
|
||||
|
||||
import org.ansj.domain.Nature;
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.recognition.Recognition;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Rule-based recognition of book titles enclosed in 《 》.
 *
 * @author ansj
 */
|
||||
public class BookRecognition implements Recognition {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private static final Nature nature = new Nature("book");
|
||||
|
||||
private static Map<String, String> ruleMap = new HashMap<>();
|
||||
|
||||
static {
|
||||
ruleMap.put("《", "》");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void recognition(Result result) {
|
||||
List<Term> terms = result.getTerms();
|
||||
String end = null;
|
||||
String name;
|
||||
|
||||
LinkedList<Term> mergeList = null;
|
||||
|
||||
List<Term> list = new LinkedList<>();
|
||||
|
||||
for (Term term : terms) {
|
||||
name = term.getName();
|
||||
if (end == null) {
|
||||
if ((end = ruleMap.get(name)) != null) {
|
||||
mergeList = new LinkedList<>();
|
||||
mergeList.add(term);
|
||||
} else {
|
||||
list.add(term);
|
||||
}
|
||||
} else {
|
||||
mergeList.add(term);
|
||||
if (end.equals(name)) {
|
||||
|
||||
Term ft = mergeList.pollFirst();
|
||||
for (Term sub : mergeList) {
|
||||
ft.merage(sub);
|
||||
}
|
||||
ft.setNature(nature);
|
||||
list.add(ft);
|
||||
mergeList = null;
|
||||
end = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// flush any pending, unclosed title terms back into the output list
if (mergeList != null) {
for (Term term : mergeList) {
list.add(term);
}
}
|
||||
|
||||
result.setTerms(list);
|
||||
}
|
||||
|
||||
}
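
A brief sketch of how this rule was applied (assumes Result.recognition(...) chaining as used elsewhere in ansj; the sentence is illustrative):

import org.ansj.domain.Result;
import org.ansj.recognition.impl.BookRecognition;
import org.ansj.splitWord.analysis.ToAnalysis;

public class BookTitleSketch {
    public static void main(String[] args) {
        // terms between 《 and 》 are merged into a single term with nature "book"
        Result result = ToAnalysis.parse("我正在读《红楼梦》这本书").recognition(new BookRecognition());
        System.out.println(result);
    }
}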
|
|
@ -1,71 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.impl;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.library.DicLibrary;
|
||||
import org.ansj.recognition.Recognition;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class DicRecognition implements Recognition {
|
||||
|
||||
private static final long serialVersionUID = 7487741700410080896L;
|
||||
|
||||
private Forest[] forests = null;
|
||||
|
||||
public DicRecognition() {
|
||||
forests = DicLibrary.gets(DicLibrary.DEFAULT);
|
||||
}
|
||||
|
||||
public DicRecognition(String[] keys) {
|
||||
forests = DicLibrary.gets(keys);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param forests
|
||||
*/
|
||||
public DicRecognition(Forest[] forests) {
|
||||
this.forests = forests;
|
||||
}
|
||||
|
||||
public DicRecognition(Forest forest) {
|
||||
this.forests = new Forest[] {forest};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void recognition(Result result) {
|
||||
for (Forest forest : forests) {
|
||||
if (forest == null) {
|
||||
continue;
|
||||
}
|
||||
recognition(result, forest);
|
||||
}
|
||||
}
|
||||
|
||||
private void recognition(Result result, Forest forest) {
|
||||
List<Term> terms = result.getTerms();
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,75 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.impl;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.recognition.Recognition;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * E-mail address extraction.
 *
 * @author ansj
 */
|
||||
public class EmailRecognition implements Recognition {
|
||||
|
||||
private static Map<String, String> FEATURE = new HashMap<>();
|
||||
|
||||
private static final String NOT_HEAD = "NOT";
|
||||
private static final String NATURE_HEAD = "nature:";
|
||||
private static final String ALL = "ALL";
|
||||
|
||||
static {
|
||||
FEATURE.put("-", NOT_HEAD);
|
||||
FEATURE.put("_", NOT_HEAD);
|
||||
FEATURE.put(".", NOT_HEAD);
|
||||
FEATURE.put(NATURE_HEAD + "en", ALL);
|
||||
FEATURE.put(NATURE_HEAD + "m", ALL);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void recognition(Result result) {
|
||||
|
||||
List<Term> terms = result.getTerms();
|
||||
|
||||
for (Term term : terms) {
|
||||
if (!"@".equals(term.getName())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for (Iterator<Term> iterator = terms.iterator(); iterator.hasNext();) {
|
||||
Term term = iterator.next();
|
||||
if (term.getName() == null) {
|
||||
iterator.remove();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -1,75 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.impl;
|
||||
|
||||
import org.ansj.domain.Nature;
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.recognition.Recognition;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Rule-based recognition of Chinese ID-card numbers.
 *
 * @author ansj
 */
|
||||
public class IDCardRecognition implements Recognition {
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -32133440735240290L;
|
||||
private static final Nature ID_CARD_NATURE = new Nature("idcard");
|
||||
|
||||
@Override
|
||||
public void recognition(Result result) {
|
||||
|
||||
List<Term> terms = result.getTerms();
|
||||
|
||||
for (Term term : terms) {
|
||||
if ("m".equals(term.getNatureStr())) {
|
||||
|
||||
if (term.getName().length() == 18) {
|
||||
term.setNature(ID_CARD_NATURE);
|
||||
} else if (term.getName().length() == 17) {
|
||||
Term to = term.to();
|
||||
if ("x".equals(to.getName())) {
|
||||
term.merage(to);
|
||||
to.setName(null);
|
||||
term.setNature(ID_CARD_NATURE);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
for (Iterator<Term> iterator = terms.iterator(); iterator.hasNext();) {
|
||||
Term term = iterator.next();
|
||||
if (term.getName() == null) {
|
||||
iterator.remove();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,306 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.impl;
|
||||
|
||||
import org.ansj.domain.*;
|
||||
import org.ansj.library.DATDictionary;
|
||||
import org.ansj.library.DicLibrary;
|
||||
import org.ansj.recognition.Recognition;
|
||||
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
|
||||
import org.ansj.splitWord.analysis.ToAnalysis;
|
||||
import org.ansj.util.MathUtil;
|
||||
import org.ansj.util.MyStaticValue;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
import org.nlpcn.commons.lang.util.WordAlert;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Utility class for part-of-speech tagging
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class NatureRecognition implements Recognition {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private static final Log logger = LogFactory.getLog();
|
||||
|
||||
private static final Forest SUFFIX_FOREST = new Forest();
|
||||
|
||||
private Forest[] forests = null;
|
||||
|
||||
static {
|
||||
try (BufferedReader reader = MyStaticValue.getNatureClassSuffix()) {
|
||||
String temp = null;
|
||||
while ((temp = reader.readLine()) != null) {
|
||||
String[] split = temp.split("\t");
|
||||
String word = split[0];
|
||||
if (word.length() > 1) {
|
||||
word = new StringBuffer(word).reverse().toString();
|
||||
}
|
||||
SUFFIX_FOREST.add(word, new String[] {split[1]});
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.warn("IO异常", e);
|
||||
}
|
||||
}
|
||||
|
||||
public NatureRecognition() {
|
||||
forests = new Forest[] {DicLibrary.get()};
|
||||
}
|
||||
|
||||
public NatureRecognition(Forest... forests) {
|
||||
this.forests = forests;
|
||||
}
|
||||
|
||||
private NatureTerm root = new NatureTerm(TermNature.BEGIN);
|
||||
|
||||
private NatureTerm[] end = {new NatureTerm(TermNature.END)};
|
||||
|
||||
private List<Term> terms = null;
|
||||
|
||||
private NatureTerm[][] natureTermTable = null;
|
||||
|
||||
/**
|
||||
* Finds the best part-of-speech path; values are assigned by reference, so no return value is needed
|
||||
*/
|
||||
@Override
|
||||
public void recognition(Result result) {
|
||||
this.terms = result.getTerms();
|
||||
natureTermTable = new NatureTerm[terms.size() + 1][];
|
||||
natureTermTable[terms.size()] = end;
|
||||
|
||||
int length = terms.size();
|
||||
for (int i = 0; i < length; i++) {
|
||||
natureTermTable[i] = getNatureTermArr(terms.get(i).termNatures().termNatures);
|
||||
}
|
||||
walk();
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs part-of-speech tagging on a list of words
|
||||
*
|
||||
* @param words
|
||||
* @param offe
|
||||
* @return
|
||||
*/
|
||||
public List<Term> recognition(List<String> words) {
|
||||
return recognition(words, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs part-of-speech tagging on a list of words
|
||||
*
|
||||
* @param words
|
||||
* @param offe
|
||||
* @return
|
||||
*/
|
||||
public List<Term> recognition(List<String> words, int offe) {
|
||||
List<Term> terms = new ArrayList<>(words.size());
|
||||
int tempOffe = 0;
|
||||
for (String word : words) {
|
||||
TermNatures tn = getTermNatures(word);
|
||||
|
||||
terms.add(new Term(word, offe + tempOffe, tn));
|
||||
tempOffe += word.length();
|
||||
}
|
||||
new NatureRecognition().recognition(new Result(terms));
|
||||
return terms;
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks up the part-of-speech information for a single word
|
||||
*
|
||||
* @param word
|
||||
* @return
|
||||
*/
|
||||
public TermNatures getTermNatures(String word) {
|
||||
String[] params = null;
|
||||
// look up the nature: first in the system dictionary, then in the user-defined dictionaries
|
||||
AnsjItem ansjItem = DATDictionary.getItem(word);
|
||||
TermNatures tn = null;
|
||||
|
||||
if (ansjItem != AnsjItem.NULL) {
|
||||
tn = ansjItem.termNatures;
|
||||
} else if ((params = getParams(word)) != null) {
|
||||
tn = new TermNatures(new TermNature(params[0], 1));
|
||||
} else if (WordAlert.isEnglish(word)) {
|
||||
tn = TermNatures.EN;
|
||||
} else if (WordAlert.isNumber(word)) {
|
||||
tn = TermNatures.M;
|
||||
} else {
|
||||
tn = TermNatures.NULL;
|
||||
}
|
||||
return tn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the dictionary parameters of a word
|
||||
*
|
||||
* @param word
|
||||
* @return
|
||||
*/
|
||||
public String[] getParams(String word) {
|
||||
for (Forest forest : forests) {
|
||||
if (forest == null) {
|
||||
continue;
|
||||
}
|
||||
SmartForest<String[]> sf = forest;
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
sf = sf.get(word.charAt(i));
|
||||
if (sf == null) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
if (sf.getStatus() > 1) {
|
||||
return sf.getParam();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Guesses the part of speech by rules
|
||||
*
|
||||
* @param word
|
||||
* @return
|
||||
*/
|
||||
public static TermNatures guessNature(String word) {
|
||||
String nature = null;
|
||||
SmartForest<String[]> smartForest = SUFFIX_FOREST;
|
||||
int len = 0;
|
||||
for (int i = word.length() - 1; i >= 0; i--) {
|
||||
smartForest = smartForest.get(word.charAt(i));
|
||||
if (smartForest == null) {
|
||||
break;
|
||||
}
|
||||
len++;
|
||||
if (smartForest.getStatus() == 2) {
|
||||
nature = smartForest.getParam()[0];
|
||||
} else if (smartForest.getStatus() == 3) {
|
||||
nature = smartForest.getParam()[0];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ("nt".equals(nature) && (len > 1 || word.length() > 3)) {
|
||||
return TermNatures.NT;
|
||||
} else if ("ns".equals(nature)) {
|
||||
return TermNatures.NS;
|
||||
} else if (word.length() < 5) {
|
||||
Result parse = ToAnalysis.parse(word);
|
||||
for (Term term : parse.getTerms()) {
|
||||
if ("nr".equals(term.getNatureStr())) {
|
||||
return TermNatures.NR;
|
||||
}
|
||||
}
|
||||
} else if (ForeignPersonRecognition.isFName(word)) {
|
||||
return TermNatures.NRF;
|
||||
}
|
||||
|
||||
return TermNatures.NW;
|
||||
}
|
||||
|
||||
public void walk() {
|
||||
int length = natureTermTable.length - 1;
|
||||
setScore(root, natureTermTable[0]);
|
||||
for (int i = 0; i < length; i++) {
|
||||
for (int j = 0; j < natureTermTable[i].length; j++) {
|
||||
setScore(natureTermTable[i][j], natureTermTable[i + 1]);
|
||||
}
|
||||
}
|
||||
optimalRoot();
|
||||
}
|
||||
|
||||
private void setScore(NatureTerm natureTerm, NatureTerm[] natureTerms) {
|
||||
|
||||
for (int i = 0; i < natureTerms.length; i++) {
|
||||
natureTerms[i].setScore(natureTerm);
|
||||
}
|
||||
}
|
||||
|
||||
private NatureTerm[] getNatureTermArr(TermNature[] termNatures) {
|
||||
NatureTerm[] natureTerms = new NatureTerm[termNatures.length];
|
||||
for (int i = 0; i < natureTerms.length; i++) {
|
||||
natureTerms[i] = new NatureTerm(termNatures[i]);
|
||||
}
|
||||
return natureTerms;
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtains the optimal path
|
||||
*/
|
||||
private void optimalRoot() {
|
||||
NatureTerm to = end[0];
|
||||
NatureTerm from = null;
|
||||
int index = natureTermTable.length - 1;
|
||||
while ((from = to.from) != null && index > 0) {
|
||||
terms.get(--index).setNature(from.termNature.nature);
|
||||
to = from;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Part-of-speech candidate of this term
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class NatureTerm {
|
||||
|
||||
public TermNature termNature;
|
||||
|
||||
public double score = 0;
|
||||
|
||||
public double selfScore;
|
||||
|
||||
public NatureTerm from;
|
||||
|
||||
protected NatureTerm(TermNature termNature) {
|
||||
this.termNature = termNature;
|
||||
selfScore = termNature.frequency + 1;
|
||||
}
|
||||
|
||||
public void setScore(NatureTerm natureTerm) {
|
||||
double tempScore = MathUtil.compuNatureFreq(natureTerm, this);
|
||||
if (from == null || score < tempScore) {
|
||||
this.score = tempScore;
|
||||
this.from = natureTerm;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return termNature.nature.natureStr + "/" + selfScore;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
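NatureRecognition can also tag text that was tokenized elsewhere, via the recognition(List<String>) overload above. A small sketch under that assumption (the word list is only illustrative):

import java.util.Arrays;
import java.util.List;

import org.ansj.domain.Term;
import org.ansj.recognition.impl.NatureRecognition;

public class NatureTagDemo {
    public static void main(String[] args) {
        // tag an already-tokenized word list; offsets are assigned starting at 0
        List<String> words = Arrays.asList("我", "爱", "北京", "天安门");
        List<Term> terms = new NatureRecognition().recognition(words);
        for (Term term : terms) {
            System.out.println(term.getName() + "/" + term.getNatureStr());
        }
    }
}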
@@ -1,151 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.impl;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.recognition.Recognition;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Filters the result; supports filtering by part of speech and by word.
|
||||
*
|
||||
* @author Ansj
|
||||
*
|
||||
*/
|
||||
public class StopRecognition implements Recognition {
|
||||
|
||||
private static final Log LOG = LogFactory.getLog();
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 7041503137429986566L;
|
||||
|
||||
private Set<String> stop = new HashSet<>();
|
||||
|
||||
private Set<String> natureStop = new HashSet<>();
|
||||
|
||||
private Set<Pattern> regexList = new HashSet<>();
|
||||
|
||||
/**
|
||||
* Adds stop words in batch
|
||||
*
|
||||
* @param filterWords
|
||||
* @return
|
||||
*/
|
||||
public StopRecognition insertStopWords(Collection<String> filterWords) {
|
||||
stop.addAll(filterWords);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds stop words in batch
|
||||
*
|
||||
* @param stopWords
|
||||
* @return
|
||||
*/
|
||||
public StopRecognition insertStopWords(String... stopWords) {
|
||||
for (String words : stopWords) {
|
||||
stop.add(words);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds stop natures in batch; for example, after adding "nr", person names no longer appear in the result
|
||||
*
|
||||
* @param stopWords
|
||||
*/
|
||||
public void insertStopNatures(String... stopNatures) {
|
||||
for (String natureStr : stopNatures) {
|
||||
natureStop.add(natureStr);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds regular-expression filters
|
||||
*
|
||||
* @param regex
|
||||
*/
|
||||
public void insertStopRegexes(String... regexes) {
|
||||
for (String regex : regexes) {
|
||||
try {
|
||||
regexList.add(Pattern.compile(regex));
|
||||
} catch (Exception e) {
|
||||
LOG.error("regex err : " + regex, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void recognition(Result result) {
|
||||
List<Term> list = result.getTerms();
|
||||
Iterator<Term> iterator = list.iterator();
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
Term term = iterator.next();
|
||||
if (filter(term)) {
|
||||
iterator.remove();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether a term should be filtered out.
|
||||
*
|
||||
* @param term
|
||||
* @return
|
||||
*/
|
||||
public boolean filter(Term term) {
|
||||
|
||||
if (!stop.isEmpty() && (stop.contains(term.getName()))) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!natureStop.isEmpty() && (natureStop.contains(term.natrue().natureStr))) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!regexList.isEmpty()) {
|
||||
for (Pattern stopwordPattern : regexList) {
|
||||
if (stopwordPattern.matcher(term.getName()).matches()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
this.stop.clear();
|
||||
this.natureStop.clear();
|
||||
this.regexList.clear();
|
||||
}
|
||||
|
||||
}
|
|
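A minimal filtering sketch built only on the methods above; the stop words, the "w" (punctuation) nature tag and the regex are illustrative assumptions:

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.splitWord.analysis.ToAnalysis;

public class StopFilterDemo {
    public static void main(String[] args) {
        StopRecognition stop = new StopRecognition();
        stop.insertStopWords("的", "了");     // drop specific words
        stop.insertStopNatures("w");          // drop terms by nature (here: punctuation)
        stop.insertStopRegexes("\\d+");       // drop anything matching a regex

        Result result = ToAnalysis.parse("今天的天气真好,气温25度!");
        stop.recognition(result);             // removes matching terms in place
        for (Term term : result.getTerms()) {
            System.out.println(term.getName());
        }
    }
}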
@@ -1,68 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.impl;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.library.SynonymsLibrary;
|
||||
import org.ansj.recognition.Recognition;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Synonym annotation support
|
||||
*
|
||||
* @author Ansj
|
||||
*
|
||||
*/
|
||||
public class SynonymsRecgnition implements Recognition {
|
||||
|
||||
private static final long serialVersionUID = 5961499108093950130L;
|
||||
|
||||
private SmartForest<List<String>> synonyms = null;
|
||||
|
||||
public SynonymsRecgnition() {
|
||||
this.synonyms = SynonymsLibrary.get();
|
||||
}
|
||||
|
||||
public SynonymsRecgnition(String key) {
|
||||
this.synonyms = SynonymsLibrary.get(key);
|
||||
}
|
||||
|
||||
public SynonymsRecgnition(SmartForest<List<String>> synonyms) {
|
||||
this.synonyms = synonyms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void recognition(Result result) {
|
||||
for (Term term : result) {
|
||||
SmartForest<List<String>> branch = synonyms.getBranch(term.getName());
|
||||
if (branch != null && branch.getStatus() > 1) {
|
||||
List<String> syns = branch.getParam();
|
||||
if (syns != null) {
|
||||
term.setSynonyms(syns);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,96 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.impl;
|
||||
|
||||
import org.ansj.domain.Nature;
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.recognition.Recognition;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Time expression recognition and extraction
|
||||
*
|
||||
* @author sunyang
|
||||
*
|
||||
*/
|
||||
public class TimeRecognition implements Recognition {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 1L;
|
||||
private static final Nature nature = new Nature("t");
|
||||
|
||||
@Override
|
||||
public void recognition(Result result) {
|
||||
String name = "";
|
||||
String timeWord = "";
|
||||
List<Term> terms = result.getTerms();
|
||||
LinkedList<Term> mergeList = new LinkedList<>();
|
||||
List<Term> list = new LinkedList<>();
|
||||
|
||||
Pattern pattern =
|
||||
Pattern.compile("((\\d|[0123456789]){1,4}年(\\d|[0123456789]){1,2}月(\\d|[0123456789]){1,2}[日|号](上午|下午|中午|晚)?(\\s)*((\\d|[0123456789]){1,2}([点|时|點|時])?((:)?(\\d|[0123456789]){1,2}(分)?((:)?(\\d|[0123456789]){1,2}(秒)?)?)?)?(\\s)*(PM|AM)?|(\\d|[0123456789]){1,2}(月|月份)(\\d|[0123456789]){1,2}([日|号])?(上午|下午|中午|晚)?(\\s)*((\\d|[0123456789]){1,2}([点|时|點|時])?((:)?(\\d|[0123456789]){1,2}(分)?((:)?(\\d|[0123456789]){1,2}(秒)?)?)?)?(\\s)*(PM|AM)?|(\\d|[0123456789]){1,2}日(上午|下午|中午|晚)?(\\s)*((\\d|[0123456789]){1,2}([点|时|點|時])?((:)?(\\d|[0123456789]){1,2}(分)?((:)?(\\d|[0123456789]){1,2}(秒)?)?)?)?(\\s)*(PM|AM)?|(昨天|昨日|昨日上午|昨日下午|昨日晚上|昨天早上|昨天上午|昨天中午|昨天下午|昨晚|昨夜|昨天晚上|今天早上|今天上午|今天下午|今晚|今天晚上|今日上午|今日下午|今日|今天|前天|今年|去年|当日|当日上午|上午|下午|中午|清晨|前晚|早上|凌晨|今晨|近日|日前|不久前)((\\d|[0123456789]){1,2}[点|时|點|時])?((:)?(\\d|[0123456789]){1,2}(分)?((:)?(\\d|[0123456789]){1,2}(秒)?)?)?(\\s)*(PM|AM)?|[\\“|\"](1|2|3|4|5|6|7|8|9|10|11|12)[·|.| |-](\\d|[0123456789]){1,2}[\\”|\"]|星期[一|二|三|四|五|六|天|日]|(\\d|[0123456789]){1,2}[点|时|點|時]((:)?(\\d|[0123456789]){1,2}(分)?((:)?(\\d|[0123456789]){1,2}(秒)?)?)?(\\s)*(PM|AM)?|(\\d|[0123456789]){4}年((\\d|[0123456789]){1,2}月)?|(\\d|[0123456789]){1,2}月|(正|一|二|三|四|五|六|七|八|九|十|十一|十二|腊)月((初|十|二十|三十)[ 一二三四五六七八九十])?(上午|下午|中午|晚)?|((\\d|[0123456789]){4}-(\\d|[0123456789]){2}-(\\d|[0123456789]){2})?(\\s)*(\\d|[0123456789]){2}:(\\d|[0123456789]){2}:(\\d|[0123456789]){2}|(\\d|[0123456789]){4}-(\\d|[0123456789]){2}-(\\d|[0123456789]){2}(\\s)*((\\d|[0123456789]){2}:(\\d|[0123456789]){2}:(\\d|[0123456789]){2})?)",
|
||||
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
|
||||
|
||||
for (int i = 0; i < terms.size(); i++) {
|
||||
boolean isTime = false;
|
||||
Term termBase = terms.get(i);
|
||||
int timeTermsLength = 1;
|
||||
int matchLength = 0; // matched length so far
|
||||
for (int j = i; j < terms.size() && matchLength < 11; j++) { // look ahead at most 14 terms to check whether they form a time expression
|
||||
Term term = terms.get(j);
|
||||
name = term.getName();
|
||||
timeWord += name;
|
||||
Matcher matcher = pattern.matcher(timeWord);
|
||||
mergeList.add(term);
|
||||
if (matcher.matches()) {
|
||||
isTime = true;
|
||||
timeTermsLength += (j - i);
|
||||
i = j;
|
||||
}
|
||||
matchLength++;
|
||||
}
|
||||
if (isTime) {
|
||||
Term ft = mergeList.pollFirst();
|
||||
for (int k = 0; k < timeTermsLength - 1; k++) {
|
||||
ft.merageWithBlank(mergeList.get(k));
|
||||
}
|
||||
ft.setNature(nature);
|
||||
list.add(ft);
|
||||
} else {
|
||||
list.add(termBase);
|
||||
}
|
||||
mergeList.clear();
|
||||
timeWord = "";
|
||||
|
||||
}
|
||||
result.setTerms(list);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
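TimeRecognition merges consecutive terms that match the date/time pattern above into a single term with nature "t". A usage sketch with an illustrative sentence:

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.impl.TimeRecognition;
import org.ansj.splitWord.analysis.ToAnalysis;

public class TimeDemo {
    public static void main(String[] args) {
        Result result = ToAnalysis.parse("会议定于2014年12月12日下午3点召开");
        new TimeRecognition().recognition(result);   // merges the time expression into one term
        for (Term term : result.getTerms()) {
            System.out.println(term.getName() + "/" + term.getNatureStr());
        }
    }
}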
@@ -1,85 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.recognition.impl;
|
||||
|
||||
import org.ansj.domain.Nature;
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.library.DicLibrary;
|
||||
import org.ansj.recognition.Recognition;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
|
||||
/**
|
||||
* Gives priority to natures from the user-defined dictionaries
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class UserDicNatureRecognition implements Recognition {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 1L;
|
||||
private Forest[] forests = null;
|
||||
|
||||
public UserDicNatureRecognition() {
|
||||
forests = new Forest[] {DicLibrary.get()};
|
||||
}
|
||||
|
||||
/**
|
||||
* Accepts multiple dictionaries; later ones override the results of earlier ones
|
||||
*
|
||||
* @param forests
|
||||
*/
|
||||
public UserDicNatureRecognition(Forest... forests) {
|
||||
this.forests = forests;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void recognition(Result result) {
|
||||
for (Term term : result) {
|
||||
for (int i = forests.length - 1; i > -1; i--) {
|
||||
String[] params = getParams(forests[i], term.getName());
|
||||
if (params != null) {
|
||||
term.setNature(new Nature(params[0]));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static String[] getParams(Forest forest, String word) {
|
||||
SmartForest<String[]> temp = forest;
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
temp = temp.get(word.charAt(i));
|
||||
if (temp == null) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
if (temp.getStatus() > 1) {
|
||||
return temp.getParam();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1,353 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.splitWord;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.domain.TermNature;
|
||||
import org.ansj.domain.TermNatures;
|
||||
import org.ansj.library.AmbiguityLibrary;
|
||||
import org.ansj.library.DicLibrary;
|
||||
import org.ansj.splitWord.impl.GetWordsImpl;
|
||||
import org.ansj.util.AnsjReader;
|
||||
import org.ansj.util.Graph;
|
||||
import org.ansj.util.MyStaticValue;
|
||||
import org.nlpcn.commons.lang.tire.GetWord;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
import org.nlpcn.commons.lang.util.StringUtil;
|
||||
import org.nlpcn.commons.lang.util.WordAlert;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import static org.ansj.library.DATDictionary.status;
|
||||
|
||||
/**
|
||||
* Basic segmentation plus person-name recognition
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public abstract class Analysis {
|
||||
|
||||
/**
|
||||
* Records the current offset
|
||||
*/
|
||||
public int offe;
|
||||
|
||||
/**
|
||||
* The word-extraction helper
|
||||
*/
|
||||
private GetWordsImpl gwi = new GetWordsImpl();
|
||||
|
||||
protected Forest[] forests = null;
|
||||
|
||||
private Forest ambiguityForest = AmbiguityLibrary.get();
|
||||
|
||||
// whether person-name recognition is enabled
|
||||
protected Boolean isNameRecognition = true;
|
||||
|
||||
// whether number recognition is enabled
|
||||
protected Boolean isNumRecognition = true;
|
||||
|
||||
// whether numbers and quantifiers are merged
|
||||
protected Boolean isQuantifierRecognition = true;
|
||||
|
||||
// whether to expose the original (real) word form
|
||||
protected Boolean isRealName = false;
|
||||
|
||||
/**
|
||||
* Reader for the input document
|
||||
*/
|
||||
private AnsjReader br;
|
||||
|
||||
protected Analysis() {
|
||||
this.forests = new Forest[] {DicLibrary.get()};
|
||||
this.isNameRecognition = MyStaticValue.isNameRecognition;
|
||||
this.isNumRecognition = MyStaticValue.isNumRecognition;
|
||||
this.isQuantifierRecognition = MyStaticValue.isQuantifierRecognition;
|
||||
this.isRealName = MyStaticValue.isRealName;
|
||||
};
|
||||
|
||||
private LinkedList<Term> terms = new LinkedList<>();
|
||||
|
||||
/**
|
||||
* Call repeatedly in a while loop; segmentation is finished once null is returned
|
||||
*
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
|
||||
public Term next() throws IOException {
|
||||
Term term = null;
|
||||
if (!terms.isEmpty()) {
|
||||
term = terms.poll();
|
||||
term.updateOffe(offe);
|
||||
return term;
|
||||
}
|
||||
|
||||
String temp = br.readLine();
|
||||
offe = br.getStart();
|
||||
while (StringUtil.isBlank(temp)) {
|
||||
if (temp == null) {
|
||||
return null;
|
||||
} else {
|
||||
temp = br.readLine();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// apply ambiguity handling to the string
|
||||
|
||||
fullTerms(temp);
|
||||
|
||||
if (!terms.isEmpty()) {
|
||||
term = terms.poll();
|
||||
term.updateOffe(offe);
|
||||
return term;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills the terms queue
|
||||
*/
|
||||
private void fullTerms(String temp) {
|
||||
List<Term> result = analysisStr(temp);
|
||||
terms.addAll(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Segments a whole sentence; user-defined ambiguity entries take priority
|
||||
*
|
||||
* @param temp
|
||||
* @return
|
||||
*/
|
||||
private List<Term> analysisStr(String temp) {
|
||||
Graph gp = new Graph(temp);
|
||||
int startOffe = 0;
|
||||
|
||||
if (this.ambiguityForest != null) {
|
||||
GetWord gw = new GetWord(this.ambiguityForest, gp.chars);
|
||||
String[] params = null;
|
||||
while ((gw.getFrontWords()) != null) {
|
||||
if (gw.offe > startOffe) {
|
||||
analysis(gp, startOffe, gw.offe);
|
||||
}
|
||||
params = gw.getParams();
|
||||
startOffe = gw.offe;
|
||||
for (int i = 0; i < params.length; i += 2) {
|
||||
gp.addTerm(new Term(params[i], startOffe, new TermNatures(new TermNature(params[i + 1], 1))));
|
||||
startOffe += params[i].length();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (startOffe < gp.chars.length) {
|
||||
analysis(gp, startOffe, gp.chars.length);
|
||||
}
|
||||
List<Term> result = this.getResult(gp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private void analysis(Graph gp, int startOffe, int endOffe) {
|
||||
int start = 0;
|
||||
int end = 0;
|
||||
char[] chars = gp.chars;
|
||||
|
||||
String str = null;
|
||||
for (int i = startOffe; i < endOffe; i++) {
|
||||
switch (status(chars[i])) {
|
||||
case 4:
|
||||
start = i;
|
||||
end = 1;
|
||||
while (++i < endOffe && status(chars[i]) == 4) {
|
||||
end++;
|
||||
}
|
||||
str = WordAlert.alertEnglish(chars, start, end);
|
||||
gp.addTerm(new Term(str, start, TermNatures.EN));
|
||||
i--;
|
||||
break;
|
||||
case 5:
|
||||
start = i;
|
||||
end = 1;
|
||||
while (++i < endOffe && status(chars[i]) == 5) {
|
||||
end++;
|
||||
}
|
||||
str = WordAlert.alertNumber(chars, start, end);
|
||||
gp.addTerm(new Term(str, start, TermNatures.M));
|
||||
i--;
|
||||
break;
|
||||
default:
|
||||
start = i;
|
||||
end = i;
|
||||
|
||||
int status = 0;
|
||||
do {
|
||||
end = ++i;
|
||||
if (i >= endOffe) {
|
||||
break;
|
||||
}
|
||||
status = status(chars[i]);
|
||||
} while (status < 4);
|
||||
|
||||
if (status > 3) {
|
||||
i--;
|
||||
}
|
||||
|
||||
gwi.setChars(chars, start, end);
|
||||
int max = start;
|
||||
while ((str = gwi.allWords()) != null) {
|
||||
Term term = new Term(str, gwi.offe, gwi.getItem());
|
||||
int len = term.getOffe() - max;
|
||||
if (len > 0) {
|
||||
for (; max < term.getOffe();) {
|
||||
gp.addTerm(new Term(String.valueOf(chars[max]), max, TermNatures.NULL));
|
||||
max++;
|
||||
}
|
||||
}
|
||||
gp.addTerm(term);
|
||||
max = term.toValue();
|
||||
}
|
||||
|
||||
int len = end - max;
|
||||
if (len > 0) {
|
||||
for (; max < end;) {
|
||||
gp.addTerm(new Term(String.valueOf(chars[max]), max, TermNatures.NULL));
|
||||
max++;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the un-normalized (original) word forms back onto the segmented terms
|
||||
*
|
||||
* @param gp
|
||||
* @param result
|
||||
*/
|
||||
protected void setRealName(Graph graph, List<Term> result) {
|
||||
|
||||
if (!MyStaticValue.isRealName) {
|
||||
return;
|
||||
}
|
||||
|
||||
String str = graph.realStr;
|
||||
|
||||
for (Term term : result) {
|
||||
term.setRealName(str.substring(term.getOffe(), term.getOffe() + term.getName().length()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Segments one sentence and wraps the result
|
||||
*
|
||||
* @param temp
|
||||
* @return
|
||||
*/
|
||||
public Result parseStr(String temp) {
|
||||
return new Result(analysisStr(temp));
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtains the segmentation result directly from the reader passed to the constructor
|
||||
*
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public Result parse() throws IOException {
|
||||
List<Term> list = new ArrayList<>();
|
||||
Term temp = null;
|
||||
while ((temp = next()) != null) {
|
||||
list.add(temp);
|
||||
}
|
||||
Result result = new Result(list);
|
||||
return result;
|
||||
}
|
||||
|
||||
protected abstract List<Term> getResult(Graph graph);
|
||||
|
||||
public abstract class Merger {
|
||||
public abstract List<Term> merger();
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the tokenizer
|
||||
*
|
||||
* @param br
|
||||
*/
|
||||
public void resetContent(AnsjReader br) {
|
||||
this.offe = 0;
|
||||
this.br = br;
|
||||
}
|
||||
|
||||
public void resetContent(Reader reader) {
|
||||
this.offe = 0;
|
||||
this.br = new AnsjReader(reader);
|
||||
}
|
||||
|
||||
public void resetContent(Reader reader, int buffer) {
|
||||
this.offe = 0;
|
||||
this.br = new AnsjReader(reader, buffer);
|
||||
}
|
||||
|
||||
public Forest getAmbiguityForest() {
|
||||
return ambiguityForest;
|
||||
}
|
||||
|
||||
public Analysis setAmbiguityForest(Forest ambiguityForest) {
|
||||
this.ambiguityForest = ambiguityForest;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Analysis setForests(Forest... forests) {
|
||||
this.forests = forests;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Analysis setIsNameRecognition(Boolean isNameRecognition) {
|
||||
this.isNameRecognition = isNameRecognition;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Analysis setIsNumRecognition(Boolean isNumRecognition) {
|
||||
this.isNumRecognition = isNumRecognition;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Analysis setIsQuantifierRecognition(Boolean isQuantifierRecognition) {
|
||||
this.isQuantifierRecognition = isQuantifierRecognition;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Analysis setIsRealName(Boolean isRealName) {
|
||||
this.isRealName = isRealName;
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
|
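Besides parseStr/parse, the class supports streaming input through resetContent(Reader) and repeated next() calls, as documented above. A sketch using the ToAnalysis subclass that appears later in this diff:

import java.io.IOException;
import java.io.StringReader;

import org.ansj.domain.Term;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.ToAnalysis;

public class StreamingDemo {
    public static void main(String[] args) throws IOException {
        // consume terms one by one instead of materialising the whole result
        Analysis analysis = new ToAnalysis(new StringReader("欢迎使用ansj中文分词"));
        Term term;
        while ((term = analysis.next()) != null) {
            System.out.println(term.getName() + " @" + term.getOffe());
        }
    }
}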
@@ -1,49 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.splitWord;
|
||||
|
||||
public interface GetWords {
|
||||
/**
|
||||
* Matches every word over the full text
|
||||
*
|
||||
* @param str
|
||||
* the sentence to be segmented
|
||||
* @return the segmented result
|
||||
*/
|
||||
public String allWords();
|
||||
|
||||
/**
|
||||
* Passes a new input string to the same object
|
||||
*
|
||||
* @param temp
|
||||
* the input sentence
|
||||
*/
|
||||
public void setStr(String temp);
|
||||
|
||||
/**
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
|
||||
public void setChars(char[] chars, int start, int end);
|
||||
|
||||
public int getOffe();
|
||||
}
|
|
@@ -1,75 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.splitWord.analysis;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.splitWord.Analysis;
|
||||
import org.ansj.util.AnsjReader;
|
||||
import org.ansj.util.Graph;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Basic segmentation: only the n-gram model and number discovery; everything else is ignored
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class BaseAnalysis extends Analysis {
|
||||
|
||||
@Override
|
||||
protected List<Term> getResult(final Graph graph) {
|
||||
Merger merger = new Merger() {
|
||||
@Override
|
||||
public List<Term> merger() {
|
||||
graph.walkPath();
|
||||
return getResult();
|
||||
}
|
||||
|
||||
private List<Term> getResult() {
|
||||
List<Term> result = new ArrayList<>();
|
||||
int length = graph.terms.length - 1;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (graph.terms[i] != null) {
|
||||
result.add(graph.terms[i]);
|
||||
}
|
||||
}
|
||||
|
||||
setRealName(graph, result);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
return merger.merger();
|
||||
}
|
||||
|
||||
public BaseAnalysis() {};
|
||||
|
||||
public BaseAnalysis(Reader reader) {
|
||||
super.resetContent(new AnsjReader(reader));
|
||||
}
|
||||
|
||||
public static Result parse(String str) {
|
||||
return new BaseAnalysis().parseStr(str);
|
||||
}
|
||||
}
|
|
@@ -1,153 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.splitWord.analysis;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
|
||||
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
|
||||
import org.ansj.recognition.arrimpl.NumRecognition;
|
||||
import org.ansj.splitWord.Analysis;
|
||||
import org.ansj.util.AnsjReader;
|
||||
import org.ansj.util.Graph;
|
||||
import org.ansj.util.NameFix;
|
||||
import org.ansj.util.TermUtil;
|
||||
import org.ansj.util.TermUtil.InsertTermType;
|
||||
import org.nlpcn.commons.lang.tire.GetWord;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* User-defined dictionary natures take priority by default
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class DicAnalysis extends Analysis {
|
||||
|
||||
@Override
|
||||
protected List<Term> getResult(final Graph graph) {
|
||||
|
||||
Merger merger = new Merger() {
|
||||
@Override
|
||||
public List<Term> merger() {
|
||||
|
||||
// 用户自定义词典的识别
|
||||
userDefineRecognition(graph, forests);
|
||||
|
||||
graph.walkPath();
|
||||
|
||||
// 数字发现
|
||||
if (isNumRecognition && graph.hasNum) {
|
||||
new NumRecognition().recognition(graph.terms);
|
||||
}
|
||||
|
||||
// 姓名识别
|
||||
if (graph.hasPerson && isNameRecognition) {
|
||||
// 亚洲人名识别
|
||||
new AsianPersonRecognition().recognition(graph.terms);
|
||||
graph.walkPathByScore();
|
||||
NameFix.nameAmbiguity(graph.terms);
|
||||
// 外国人名识别
|
||||
new ForeignPersonRecognition().recognition(graph.terms);
|
||||
graph.walkPathByScore();
|
||||
}
|
||||
|
||||
return getResult();
|
||||
}
|
||||
|
||||
private void userDefineRecognition(final Graph graph, Forest... forests) {
|
||||
|
||||
if (forests == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
int beginOff = graph.terms[0].getOffe();
|
||||
|
||||
Forest forest = null;
|
||||
for (int i = forests.length - 1; i >= 0; i--) {
|
||||
forest = forests[i];
|
||||
if (forest == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
GetWord word = forest.getWord(graph.chars);
|
||||
String temp = null;
|
||||
int tempFreq = 50;
|
||||
while ((temp = word.getAllWords()) != null) {
|
||||
if (graph.terms[word.offe] == null) {
|
||||
continue;
|
||||
}
|
||||
tempFreq = getInt(word.getParam()[1], 50);
|
||||
Term term = new Term(temp, beginOff + word.offe, word.getParam()[0], tempFreq);
|
||||
term.selfScore(-1 * Math.pow(Math.log(tempFreq), temp.length()));
|
||||
TermUtil.insertTerm(graph.terms, term, InsertTermType.REPLACE);
|
||||
}
|
||||
}
|
||||
graph.rmLittlePath();
|
||||
graph.walkPathByScore();
|
||||
graph.rmLittlePath();
|
||||
}
|
||||
|
||||
private int getInt(String str, int def) {
|
||||
try {
|
||||
return Integer.parseInt(str);
|
||||
} catch (NumberFormatException e) {
|
||||
return def;
|
||||
}
|
||||
}
|
||||
|
||||
private List<Term> getResult() {
|
||||
|
||||
List<Term> result = new ArrayList<>();
|
||||
int length = graph.terms.length - 1;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (graph.terms[i] != null) {
|
||||
result.add(graph.terms[i]);
|
||||
}
|
||||
}
|
||||
setRealName(graph, result);
|
||||
|
||||
return result;
|
||||
}
|
||||
};
|
||||
return merger.merger();
|
||||
}
|
||||
|
||||
public DicAnalysis() {
|
||||
super();
|
||||
}
|
||||
|
||||
public DicAnalysis(Reader reader) {
|
||||
super.resetContent(new AnsjReader(reader));
|
||||
}
|
||||
|
||||
public static Result parse(String str) {
|
||||
return new DicAnalysis().parseStr(str);
|
||||
}
|
||||
|
||||
public static Result parse(String str, Forest... forests) {
|
||||
return new DicAnalysis().setForests(forests).parseStr(str);
|
||||
}
|
||||
}
|
|
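DicAnalysis lets user dictionaries win over the core dictionary. A sketch that builds a small in-memory Forest; the two-element parameter array (nature, then frequency) follows the getParam()[0]/getParam()[1] usage above, and the dictionary entry itself is an illustrative assumption:

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.DicAnalysis;
import org.nlpcn.commons.lang.tire.domain.Forest;

public class UserDictDemo {
    public static void main(String[] args) throws Exception {
        // value[0] = nature, value[1] = frequency, matching getParam()[0] / getParam()[1] above
        Forest forest = new Forest();
        forest.add("深度学习", new String[] {"n", "1000"});

        Result result = DicAnalysis.parse("我们在研究深度学习框架", forest);
        for (Term term : result.getTerms()) {
            System.out.println(term.getName() + "/" + term.getNatureStr());
        }
    }
}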
@@ -1,163 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.splitWord.analysis;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
|
||||
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
|
||||
import org.ansj.recognition.arrimpl.NumRecognition;
|
||||
import org.ansj.recognition.arrimpl.UserDefineRecognition;
|
||||
import org.ansj.splitWord.Analysis;
|
||||
import org.ansj.util.AnsjReader;
|
||||
import org.ansj.util.Graph;
|
||||
import org.ansj.util.NameFix;
|
||||
import org.ansj.util.TermUtil.InsertTermType;
|
||||
import org.nlpcn.commons.lang.tire.GetWord;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
import org.nlpcn.commons.lang.util.ObjConver;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Segmentation mode intended for indexing and search
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class IndexAnalysis extends Analysis {
|
||||
|
||||
@Override
|
||||
protected List<Term> getResult(final Graph graph) {
|
||||
Merger merger = new Merger() {
|
||||
|
||||
@Override
|
||||
public List<Term> merger() {
|
||||
graph.walkPath();
|
||||
|
||||
// 数字发现
|
||||
if (isNumRecognition && graph.hasNum) {
|
||||
new NumRecognition().recognition(graph.terms);
|
||||
}
|
||||
|
||||
// 姓名识别
|
||||
if (graph.hasPerson && isNameRecognition) {
|
||||
// 亚洲人名识别
|
||||
new AsianPersonRecognition().recognition(graph.terms);
|
||||
graph.walkPathByScore();
|
||||
NameFix.nameAmbiguity(graph.terms);
|
||||
// 外国人名识别
|
||||
new ForeignPersonRecognition().recognition(graph.terms);
|
||||
graph.walkPathByScore();
|
||||
}
|
||||
|
||||
// 用户自定义词典的识别
|
||||
userDefineRecognition(graph, forests);
|
||||
|
||||
return result();
|
||||
}
|
||||
|
||||
private void userDefineRecognition(final Graph graph, Forest... forests) {
|
||||
new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
|
||||
graph.rmLittlePath();
|
||||
graph.walkPathByScore();
|
||||
}
|
||||
|
||||
/**
|
||||
* Index-oriented segmentation
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private List<Term> result() {
|
||||
|
||||
String temp = null;
|
||||
|
||||
Set<String> set = new HashSet<>();
|
||||
|
||||
List<Term> result = new LinkedList<>();
|
||||
int length = graph.terms.length - 1;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (graph.terms[i] != null) {
|
||||
result.add(graph.terms[i]);
|
||||
set.add(graph.terms[i].getName() + graph.terms[i].getOffe());
|
||||
}
|
||||
}
|
||||
|
||||
LinkedList<Term> last = new LinkedList<>();
|
||||
|
||||
char[] chars = graph.chars;
|
||||
|
||||
if (forests != null) {
|
||||
for (Forest forest : forests) {
|
||||
if (forest == null) {
|
||||
continue;
|
||||
}
|
||||
GetWord word = forest.getWord(chars);
|
||||
while ((temp = word.getAllWords()) != null) {
|
||||
if (!set.contains(temp + word.offe)) {
|
||||
set.add(temp + word.offe);
|
||||
last.add(new Term(temp, word.offe, word.getParam(0),
|
||||
ObjConver.getIntValue(word.getParam(1))));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result.addAll(last);
|
||||
|
||||
Collections.sort(result, new Comparator<Term>() {
|
||||
|
||||
@Override
|
||||
public int compare(Term o1, Term o2) {
|
||||
if (o1.getOffe() == o2.getOffe()) {
|
||||
return o2.getName().length() - o1.getName().length();
|
||||
} else {
|
||||
return o1.getOffe() - o2.getOffe();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
setRealName(graph, result);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
return merger.merger();
|
||||
}
|
||||
|
||||
public IndexAnalysis() {
|
||||
super();
|
||||
}
|
||||
|
||||
public IndexAnalysis(Reader reader) {
|
||||
super.resetContent(new AnsjReader(reader));
|
||||
}
|
||||
|
||||
public static Result parse(String str) {
|
||||
return new IndexAnalysis().parseStr(str);
|
||||
}
|
||||
|
||||
public static Result parse(String str, Forest... forests) {
|
||||
return new IndexAnalysis().setForests(forests).parseStr(str);
|
||||
}
|
||||
|
||||
}
|
|
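Compared with the standard mode, the index mode also emits the shorter dictionary words that overlap the best path, trading precision for recall. A side-by-side sketch with an illustrative sentence:

import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;

public class IndexVsStandardDemo {
    public static void main(String[] args) {
        String text = "中华人民共和国成立了";
        // standard segmentation: a single best path
        System.out.println(ToAnalysis.parse(text).getTerms());
        // index segmentation: additionally emits overlapping shorter words for search recall
        System.out.println(IndexAnalysis.parse(text).getTerms());
    }
}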
@@ -1,288 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.splitWord.analysis;
|
||||
|
||||
import org.ansj.app.crf.SplitWord;
|
||||
import org.ansj.dic.LearnTool;
|
||||
import org.ansj.domain.*;
|
||||
import org.ansj.library.CrfLibrary;
|
||||
import org.ansj.recognition.arrimpl.*;
|
||||
import org.ansj.recognition.impl.NatureRecognition;
|
||||
import org.ansj.splitWord.Analysis;
|
||||
import org.ansj.util.AnsjReader;
|
||||
import org.ansj.util.Graph;
|
||||
import org.ansj.util.NameFix;
|
||||
import org.ansj.util.TermUtil;
|
||||
import org.ansj.util.TermUtil.InsertTermType;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
import org.nlpcn.commons.lang.util.MapCount;
|
||||
import org.nlpcn.commons.lang.util.WordAlert;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* NLP segmentation with unknown-word (out-of-vocabulary) discovery. Recommended for natural language understanding; do not use it for search.
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class NlpAnalysis extends Analysis {
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(NlpAnalysis.class);
|
||||
|
||||
private LearnTool learn = null;
|
||||
|
||||
private static final String TAB = "\t";
|
||||
|
||||
private static final int CRF_WEIGHT = 6;
|
||||
|
||||
private SplitWord splitWord = CrfLibrary.get();
|
||||
|
||||
@Override
|
||||
protected List<Term> getResult(final Graph graph) {
|
||||
|
||||
Merger merger = new Merger() {
|
||||
@Override
|
||||
public List<Term> merger() {
|
||||
|
||||
if (learn == null) {
|
||||
learn = new LearnTool();
|
||||
}
|
||||
|
||||
graph.walkPath();
|
||||
|
||||
learn.learn(graph, splitWord, forests);
|
||||
|
||||
// 姓名识别
|
||||
if (graph.hasPerson && isNameRecognition) {
|
||||
// 亚洲人名识别
|
||||
new AsianPersonRecognition().recognition(graph.terms);
|
||||
graph.walkPathByScore();
|
||||
NameFix.nameAmbiguity(graph.terms);
|
||||
// 外国人名识别
|
||||
new ForeignPersonRecognition().recognition(graph.terms);
|
||||
graph.walkPathByScore();
|
||||
}
|
||||
|
||||
if (splitWord != null) {
|
||||
MapCount<String> mc = new MapCount<>();
|
||||
|
||||
// 通过crf分词
|
||||
List<String> words = splitWord.cut(graph.chars);
|
||||
|
||||
Term tempTerm = null;
|
||||
|
||||
int tempOff = 0;
|
||||
|
||||
if (!words.isEmpty()) {
|
||||
String word = words.get(0);
|
||||
if (!isRuleWord(word)) {
|
||||
mc.add("始##始" + TAB + word, CRF_WEIGHT);
|
||||
}
|
||||
}
|
||||
|
||||
for (String word : words) {
|
||||
|
||||
TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word); // 尝试从词典获取词性
|
||||
|
||||
Term term = null;
|
||||
|
||||
if (termNatures != TermNatures.NULL) {
|
||||
term = new Term(word, tempOff, termNatures);
|
||||
} else {
|
||||
term = new Term(word, tempOff, TermNatures.NW);
|
||||
term.setNewWord(true);
|
||||
}
|
||||
|
||||
tempOff += word.length(); // 增加偏移量
|
||||
if (isRuleWord(word)) { // 如果word不对那么不要了
|
||||
tempTerm = null;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (term.isNewWord()) { // 尝试猜测词性
|
||||
termNatures = NatureRecognition.guessNature(word);
|
||||
term.updateTermNaturesAndNature(termNatures);
|
||||
}
|
||||
|
||||
TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
|
||||
|
||||
// be conservative about words that are not in the dictionary
|
||||
if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
|
||||
mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
|
||||
}
|
||||
|
||||
tempTerm = term;
|
||||
|
||||
if (term.isNewWord()) {
|
||||
learn.addTerm(new NewWord(word, Nature.NW));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (tempTerm != null && !tempTerm.isNewWord()) {
|
||||
mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
|
||||
}
|
||||
graph.walkPath(mc.get());
|
||||
} else {
|
||||
LOG.warn("not find any crf model, make sure your config right? ");
|
||||
}
|
||||
|
||||
// 数字发现
|
||||
if (graph.hasNum && isNumRecognition) {
|
||||
new NumRecognition().recognition(graph.terms);
|
||||
}
|
||||
|
||||
// 词性标注
|
||||
List<Term> result = getResult();
|
||||
|
||||
// 用户自定义词典的识别
|
||||
new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
|
||||
graph.rmLittlePath();
|
||||
graph.walkPathByScore();
|
||||
|
||||
// 进行新词发现
|
||||
new NewWordRecognition(learn).recognition(graph.terms);
|
||||
graph.walkPathByScore();
|
||||
|
||||
// 优化后重新获得最优路径
|
||||
result = getResult();
|
||||
|
||||
// 激活辞典
|
||||
for (Term term : result) {
|
||||
learn.active(term.getName());
|
||||
}
|
||||
|
||||
setRealName(graph, result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private List<Term> getResult() {
|
||||
|
||||
List<Term> result = new ArrayList<>();
|
||||
int length = graph.terms.length - 1;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (graph.terms[i] == null) {
|
||||
continue;
|
||||
}
|
||||
result.add(graph.terms[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
return merger.merger();
|
||||
}
|
||||
|
||||
// temporary handling of special characters in new words
|
||||
private static final Set<Character> filter = new HashSet<>();
|
||||
|
||||
static {
|
||||
filter.add(':');
|
||||
filter.add(' ');
|
||||
filter.add(':');
|
||||
filter.add(' ');
|
||||
filter.add(',');
|
||||
filter.add('”');
|
||||
filter.add('“');
|
||||
filter.add('?');
|
||||
filter.add('。');
|
||||
filter.add('!');
|
||||
filter.add('。');
|
||||
filter.add(',');
|
||||
filter.add('.');
|
||||
filter.add('、');
|
||||
filter.add('\\');
|
||||
filter.add(';');
|
||||
filter.add(';');
|
||||
filter.add('?');
|
||||
filter.add('?');
|
||||
filter.add('!');
|
||||
filter.add('\"');
|
||||
filter.add('(');
|
||||
filter.add(')');
|
||||
filter.add('(');
|
||||
filter.add(')');
|
||||
filter.add('…');
|
||||
filter.add('…');
|
||||
filter.add('—');
|
||||
filter.add('-');
|
||||
filter.add('-');
|
||||
|
||||
filter.add('—');
|
||||
filter.add('《');
|
||||
filter.add('》');
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether a word produced by new-word discovery is trustworthy
|
||||
*
|
||||
* @param word
|
||||
* @return
|
||||
*/
|
||||
public static boolean isRuleWord(String word) {
|
||||
char c = 0;
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
c = word.charAt(i);
|
||||
|
||||
if (c != '·') {
|
||||
if (c < 256 || filter.contains(c) || (c = WordAlert.CharCover(word.charAt(i))) > 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public NlpAnalysis setCrfModel(SplitWord splitWord) {
|
||||
this.splitWord = splitWord;
|
||||
return this;
|
||||
}
|
||||
|
||||
public NlpAnalysis setLearnTool(LearnTool learn) {
|
||||
this.learn = learn;
|
||||
return this;
|
||||
}
|
||||
|
||||
public NlpAnalysis() {
|
||||
super();
|
||||
}
|
||||
|
||||
public NlpAnalysis(Reader reader) {
|
||||
super.resetContent(new AnsjReader(reader));
|
||||
}
|
||||
|
||||
public static Result parse(String str) {
|
||||
return new NlpAnalysis().parseStr(str);
|
||||
}
|
||||
|
||||
public static Result parse(String str, Forest... forests) {
|
||||
return new NlpAnalysis().setForests(forests).parseStr(str);
|
||||
}
|
||||
|
||||
}
|
|
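NlpAnalysis depends on a CRF model (it only logs a warning when none is configured) and can collect newly discovered words through a LearnTool. A minimal sketch; the input sentence is illustrative:

import org.ansj.dic.LearnTool;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.NlpAnalysis;

public class NewWordDemo {
    public static void main(String[] args) {
        LearnTool learn = new LearnTool();   // accumulates candidate new words during parsing
        Result result = new NlpAnalysis().setLearnTool(learn).parseStr("洁面仪配合洁面乳深层清洁毛孔");
        for (Term term : result.getTerms()) {
            System.out.println(term.getName() + "/" + term.getNatureStr());
        }
    }
}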
@@ -1,116 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.splitWord.analysis;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
|
||||
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
|
||||
import org.ansj.recognition.arrimpl.NumRecognition;
|
||||
import org.ansj.recognition.arrimpl.UserDefineRecognition;
|
||||
import org.ansj.splitWord.Analysis;
|
||||
import org.ansj.util.AnsjReader;
|
||||
import org.ansj.util.Graph;
|
||||
import org.ansj.util.NameFix;
|
||||
import org.ansj.util.TermUtil.InsertTermType;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Standard segmentation
|
||||
*
|
||||
* @author ansj
|
||||
*
|
||||
*/
|
||||
public class ToAnalysis extends Analysis {
|
||||
|
||||
@Override
|
||||
protected List<Term> getResult(final Graph graph) {
|
||||
|
||||
Merger merger = new Merger() {
|
||||
@Override
|
||||
public List<Term> merger() {
|
||||
|
||||
graph.walkPath();
|
||||
|
||||
// 数字发现
|
||||
if (isNumRecognition && graph.hasNum) {
|
||||
new NumRecognition().recognition(graph.terms);
|
||||
}
|
||||
|
||||
// 姓名识别
|
||||
if (graph.hasPerson && isNameRecognition) {
|
||||
// 亚洲人名识别
|
||||
new AsianPersonRecognition().recognition(graph.terms);
|
||||
graph.walkPathByScore();
|
||||
NameFix.nameAmbiguity(graph.terms);
|
||||
// 外国人名识别
|
||||
new ForeignPersonRecognition().recognition(graph.terms);
|
||||
graph.walkPathByScore();
|
||||
}
|
||||
|
||||
// 用户自定义词典的识别
|
||||
userDefineRecognition(graph, forests);
|
||||
|
||||
return getResult();
|
||||
}
|
||||
|
||||
private void userDefineRecognition(final Graph graph, Forest... forests) {
|
||||
new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
|
||||
graph.rmLittlePath();
|
||||
graph.walkPathByScore();
|
||||
}
|
||||
|
||||
private List<Term> getResult() {
|
||||
List<Term> result = new ArrayList<>();
|
||||
int length = graph.terms.length - 1;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (graph.terms[i] != null) {
|
||||
result.add(graph.terms[i]);
|
||||
}
|
||||
}
|
||||
setRealName(graph, result);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
return merger.merger();
|
||||
}
|
||||
|
||||
public ToAnalysis() {
|
||||
super();
|
||||
}
|
||||
|
||||
public ToAnalysis(Reader reader) {
|
||||
super.resetContent(new AnsjReader(reader));
|
||||
}
|
||||
|
||||
public static Result parse(String str) {
|
||||
return new ToAnalysis().parseStr(str);
|
||||
}
|
||||
|
||||
public static Result parse(String str, Forest... forests) {
|
||||
return new ToAnalysis().setForests(forests).parseStr(str);
|
||||
}
|
||||
|
||||
}
|
|
@@ -1,149 +0,0 @@
|
|||
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.splitWord.impl;
|
||||
|
||||
import org.ansj.domain.AnsjItem;
|
||||
import org.ansj.library.DATDictionary;
|
||||
import org.ansj.splitWord.GetWords;
|
||||
|
||||
public class GetWordsImpl implements GetWords {
|
||||
|
||||
/**
 * offe : offset of the current word
 */
|
||||
public int offe;
|
||||
|
||||
/**
 * Constructor that also loads the dictionary; passing a string is equivalent to calling setStr().
 */
|
||||
public GetWordsImpl(String str) {
|
||||
setStr(str);
|
||||
}
|
||||
|
||||
/**
 * Constructor that also loads the dictionary.
 */
|
||||
public GetWordsImpl() {}
|
||||
|
||||
int charsLength = 0;
|
||||
|
||||
@Override
|
||||
public void setStr(String str) {
|
||||
setChars(str.toCharArray(), 0, str.length());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setChars(char[] chars, int start, int end) {
|
||||
this.chars = chars;
|
||||
i = start;
|
||||
this.start = start;
|
||||
charsLength = end;
|
||||
checkValue = 0;
|
||||
}
|
||||
|
||||
public char[] chars;
|
||||
private int charHashCode;
|
||||
private int start = 0;
|
||||
public int end = 0;
|
||||
private int baseValue = 0;
|
||||
private int checkValue = 0;
|
||||
private int tempBaseValue = 0;
|
||||
public int i = 0;
|
||||
private String str = null;
|
||||
|
||||
@Override
|
||||
public String allWords() {
|
||||
for (; i < charsLength; i++) {
|
||||
charHashCode = chars[i];
|
||||
end++;
|
||||
switch (getStatement()) {
|
||||
case 0:
|
||||
if (baseValue == chars[i]) {
|
||||
str = String.valueOf(chars[i]);
|
||||
offe = i;
|
||||
start = ++i;
|
||||
end = 0;
|
||||
baseValue = 0;
|
||||
tempBaseValue = baseValue;
|
||||
return str;
|
||||
} else {
|
||||
int startCharStatus = DATDictionary.getItem(chars[start]).getStatus();
|
||||
if (startCharStatus == 1) { // if the status of the character at start is 1, set start to i; otherwise advance start by 1
|
||||
start = i;
|
||||
i--;
|
||||
end = 0;
|
||||
baseValue = 0;
|
||||
} else {
|
||||
i = start;
|
||||
start++;
|
||||
end = 0;
|
||||
baseValue = 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
i++;
|
||||
offe = start;
|
||||
tempBaseValue = baseValue;
|
||||
return DATDictionary.getItem(tempBaseValue).getName();
|
||||
case 3:
|
||||
offe = start;
|
||||
start++;
|
||||
i = start;
|
||||
end = 0;
|
||||
tempBaseValue = baseValue;
|
||||
baseValue = 0;
|
||||
return DATDictionary.getItem(tempBaseValue).getName();
|
||||
}
|
||||
|
||||
}
|
||||
end = 0;
|
||||
baseValue = 0;
|
||||
i = 0;
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Returns the word state for the current character: 0 = the character is not in the dictionary,
 * 1 = continue, 2 = a complete word that may still be extended, 3 = stop, already a complete word.
 *
 * @return the state code
 */
|
||||
private int getStatement() {
|
||||
checkValue = baseValue;
|
||||
baseValue = DATDictionary.getItem(checkValue).getBase() + charHashCode;
|
||||
if (baseValue < DATDictionary.arrayLength && (DATDictionary.getItem(baseValue).getCheck() == checkValue
|
||||
|| DATDictionary.getItem(baseValue).getCheck() == -1)) {
|
||||
return DATDictionary.getItem(baseValue).getStatus();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public AnsjItem getItem() {
|
||||
return DATDictionary.getItem(tempBaseValue);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getOffe() {
|
||||
return offe;
|
||||
}
|
||||
|
||||
}
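A minimal sketch of how this word iterator is typically driven (illustrative only; the input string and wrapper class are hypothetical, while setStr, allWords and getOffe are defined above):

import org.ansj.splitWord.impl.GetWordsImpl;

public class GetWordsExample {
    public static void main(String[] args) {
        GetWordsImpl getWords = new GetWordsImpl("中国人民银行");
        String word;
        // allWords() returns the next dictionary word, or null once the input is exhausted
        while ((word = getWords.allWords()) != null) {
            System.out.println(word + " @ offset " + getWords.getOffe());
        }
    }
}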
@@ -1,240 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.util;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
 * Adapted from the JDK BufferedReader. For efficiency this stream is not thread-safe. This reader
 * only returns sentences and never emits \r\n; a start field records the starting offset of the
 * string currently being returned.
 *
 * @author ansj
 *
 */
|
||||
public class AnsjReader extends Reader {
|
||||
|
||||
private Reader in;
|
||||
|
||||
private char cb[];
|
||||
|
||||
private static int defaultCharBufferSize = 8192;
|
||||
|
||||
/**
|
||||
* Creates a buffering character-input stream that uses an input buffer of
|
||||
* the specified size.
|
||||
*
|
||||
* @param in
|
||||
* A Reader
|
||||
* @param sz
|
||||
* Input-buffer size
|
||||
*
|
||||
* @exception IllegalArgumentException
|
||||
* If {@code sz <= 0}
|
||||
*/
|
||||
public AnsjReader(Reader in, int sz) {
|
||||
super(in);
|
||||
if (sz <= 0)
|
||||
throw new IllegalArgumentException("Buffer size <= 0");
|
||||
this.in = in;
|
||||
cb = new char[sz];
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a buffering character-input stream that uses a default-sized
|
||||
* input buffer.
|
||||
*
|
||||
* @param in
|
||||
* A Reader
|
||||
*/
|
||||
public AnsjReader(Reader in) {
|
||||
this(in, defaultCharBufferSize);
|
||||
}
|
||||
|
||||
/** Checks to make sure that the stream has not been closed */
|
||||
private void ensureOpen() throws IOException {
|
||||
if (in == null)
|
||||
throw new IOException("Stream closed");
|
||||
}
|
||||
|
||||
/**
 * Intentionally not implemented, to keep this class single-purpose.
 */
|
||||
@Override
|
||||
public int read(char cbuf[], int off, int len) throws IOException {
|
||||
throw new IOException("AnsjBufferedReader not support this interface! ");
|
||||
}
|
||||
|
||||
private int start = 0;
|
||||
private int tempStart = 0;
|
||||
|
||||
/**
 * Reads one line of data. Note: the returned result omits \n and \r.
 */
|
||||
public String readLine() throws IOException {
|
||||
|
||||
ensureOpen();
|
||||
|
||||
StringBuilder sb = null;
|
||||
|
||||
start = tempStart;
|
||||
|
||||
firstRead = true;
|
||||
|
||||
while (true) {
|
||||
|
||||
tempLen = 0;
|
||||
ok = false;
|
||||
|
||||
readString();
|
||||
// if (tempLen != 0)
|
||||
// System.out.println(new String(cb, tempOffe, tempLen));
|
||||
|
||||
if (!isRead && (tempLen == 0 || len == 0)) {
|
||||
if (sb != null) {
|
||||
return sb.toString();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!isRead) { // nothing more needs to be read, so return
|
||||
tempStart += tempLen;
|
||||
if (sb == null) {
|
||||
return new String(cb, tempOffe, tempLen);
|
||||
} else {
|
||||
sb.append(cb, tempOffe, tempLen);
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
if (tempLen == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// more data is needed, so keep reading
|
||||
if (sb == null) {
|
||||
sb = new StringBuilder();
|
||||
}
|
||||
sb.append(cb, tempOffe, tempLen);
|
||||
tempStart += tempLen;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
int offe = 0;
|
||||
int len = 0;
|
||||
|
||||
boolean isRead = false;
|
||||
boolean ok = false;
|
||||
boolean firstRead = true;
|
||||
|
||||
int tempOffe;
|
||||
int tempLen;
|
||||
|
||||
private void readString() throws IOException {
|
||||
|
||||
if (offe <= 0) {
|
||||
if (offe == -1) {
|
||||
isRead = false;
|
||||
return;
|
||||
}
|
||||
|
||||
len = in.read(cb);
|
||||
if (len <= 0) { // end of stream reached
|
||||
isRead = false;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
isRead = true;
|
||||
|
||||
char c = 0;
|
||||
int i = offe;
|
||||
for (; i < len; i++) {
|
||||
c = cb[i];
|
||||
if (c != '\r' && c != '\n') {
|
||||
break;
|
||||
}
|
||||
if (!firstRead) {
|
||||
i++;
|
||||
tempStart++;
|
||||
offe = i;
|
||||
tempOffe = offe;
|
||||
isRead = false;
|
||||
return;
|
||||
}
|
||||
tempStart++;
|
||||
start++;
|
||||
}
|
||||
|
||||
if (i == len) {
|
||||
isRead = true;
|
||||
offe = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
firstRead = false;
|
||||
|
||||
offe = i;
|
||||
|
||||
for (; i < len; i++) {
|
||||
c = cb[i];
|
||||
if (c == '\n' || c == '\r') {
|
||||
isRead = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
tempOffe = offe;
|
||||
tempLen = i - offe;
|
||||
|
||||
if (i == len) {
|
||||
if (len < cb.length) { // end of stream reached
|
||||
isRead = false;
|
||||
offe = -1;
|
||||
} else {
|
||||
offe = 0;
|
||||
}
|
||||
} else {
|
||||
offe = i;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
synchronized (lock) {
|
||||
if (in == null)
|
||||
return;
|
||||
try {
|
||||
in.close();
|
||||
} finally {
|
||||
in = null;
|
||||
cb = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public int getStart() {
|
||||
return this.start;
|
||||
}
|
||||
|
||||
}
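A minimal sketch of the sentence-oriented readLine() contract above (illustrative only; the sample input and wrapper class are hypothetical): \r and \n are consumed but never returned, and getStart() reports the offset of the chunk just returned.

import java.io.IOException;
import java.io.StringReader;
import org.ansj.util.AnsjReader;

public class AnsjReaderExample {
    public static void main(String[] args) throws IOException {
        AnsjReader reader = new AnsjReader(new StringReader("第一句。\r\n第二句。"));
        String line;
        while ((line = reader.readLine()) != null) {
            System.out.println(reader.getStart() + ": " + line);
        }
        reader.close();
    }
}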
@@ -1,372 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.util;
|
||||
|
||||
import org.ansj.domain.AnsjItem;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.domain.TermNatures;
|
||||
import org.ansj.library.DATDictionary;
|
||||
import org.ansj.splitWord.Analysis.Merger;
|
||||
import org.ansj.util.TermUtil.InsertTermType;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Shortest-path word graph
 *
 * @author ansj
 *
 */
|
||||
public class Graph {
|
||||
public char[] chars = null;
|
||||
public String realStr = null;
|
||||
public Term[] terms = null;
|
||||
protected Term end = null;
|
||||
protected Term root = null;
|
||||
protected static final String E = "末##末";
|
||||
protected static final String B = "始##始";
|
||||
// whether the text contains person names
|
||||
public boolean hasPerson;
|
||||
// whether the text contains numbers
|
||||
public boolean hasNum;
|
||||
|
||||
// whether ambiguity handling is needed
|
||||
|
||||
public Graph(String str) {
|
||||
realStr = str;
|
||||
this.chars = str.toCharArray();
|
||||
terms = new Term[chars.length + 1];
|
||||
end = new Term(E, chars.length, AnsjItem.END);
|
||||
root = new Term(B, -1, AnsjItem.BEGIN);
|
||||
terms[chars.length] = end;
|
||||
}
|
||||
|
||||
/**
 * Build the optimal path
 */
|
||||
public List<Term> getResult(Merger merger) {
|
||||
return merger.merger();
|
||||
}
|
||||
|
||||
/**
 * Add a term to the graph
 *
 * @param term
 */
|
||||
public void addTerm(Term term) {
|
||||
// check for numbers
|
||||
if (!hasNum && term.termNatures().numAttr.numFreq > 0) {
|
||||
hasNum = true;
|
||||
}
|
||||
// check for person names
|
||||
if (!hasPerson && term.termNatures().personAttr.flag) {
|
||||
hasPerson = true;
|
||||
}
|
||||
TermUtil.insertTerm(terms, term, InsertTermType.REPLACE);
|
||||
|
||||
}
|
||||
|
||||
/**
 * Get the root Term of the optimal path
 *
 * @return
 */
|
||||
protected Term optimalRoot() {
|
||||
Term to = end;
|
||||
to.clearScore();
|
||||
Term from = null;
|
||||
while ((from = to.from()) != null) {
|
||||
for (int i = from.getOffe() + 1; i < to.getOffe(); i++) {
|
||||
terms[i] = null;
|
||||
}
|
||||
if (from.getOffe() > -1) {
|
||||
terms[from.getOffe()] = from;
|
||||
}
|
||||
// break the horizontal linked list to save memory
|
||||
from.setNext(null);
|
||||
from.setTo(to);
|
||||
from.clearScore();
|
||||
to = from;
|
||||
}
|
||||
return root;
|
||||
}
|
||||
|
||||
/**
 * Remove the shortest nodes
 */
|
||||
public void rmLittlePath() {
|
||||
int maxTo = -1;
|
||||
Term temp = null;
|
||||
Term maxTerm = null;
|
||||
// whether paths cross
|
||||
boolean flag = false;
|
||||
final int length = terms.length - 1;
|
||||
for (int i = 0; i < length; i++) {
|
||||
maxTerm = getMaxTerm(i);
|
||||
if (maxTerm == null)
|
||||
continue;
|
||||
|
||||
maxTo = maxTerm.toValue();
|
||||
|
||||
/**
 * Optimize by word length: a single-character term is skipped; a two-character term whose second
 * position is null is also skipped, continuing from the position after it.
 */
|
||||
switch (maxTerm.getName().length()) {
|
||||
case 1:
|
||||
continue;
|
||||
case 2:
|
||||
if (terms[i + 1] == null) {
|
||||
i = i + 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Check whether paths cross
 */
|
||||
for (int j = i + 1; j < maxTo; j++) {
|
||||
temp = getMaxTerm(j);
|
||||
if (temp == null) {
|
||||
continue;
|
||||
}
|
||||
if (maxTo < temp.toValue()) {
|
||||
maxTo = temp.toValue();
|
||||
flag = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (flag) {
|
||||
i = maxTo - 1;
|
||||
flag = false;
|
||||
} else {
|
||||
maxTerm.setNext(null);
|
||||
terms[i] = maxTerm;
|
||||
for (int j = i + 1; j < maxTo; j++) {
|
||||
terms[j] = null;
|
||||
}
|
||||
// FIXME: in theory this should be set here, but it has run this long without errors, so it probably does not rely on the backward links. Need to confirm whether this code is useful.
|
||||
// // re-point the from of the following to-terms back to maxTerm
|
||||
// temp = terms[i+maxTerm.getName().length()] ;
|
||||
// do{
|
||||
// temp.setFrom(maxTerm) ;
|
||||
// }while((temp=temp.next())!=null) ;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Get the longest term in this row, i.e. the right-most term
 *
 * @param i
 * @return
 */
|
||||
private Term getMaxTerm(int i) {
|
||||
Term maxTerm = terms[i];
|
||||
if (maxTerm == null) {
|
||||
return null;
|
||||
}
|
||||
Term term = maxTerm;
|
||||
while ((term = term.next()) != null) {
|
||||
maxTerm = term;
|
||||
}
|
||||
return maxTerm;
|
||||
}
|
||||
|
||||
/**
 * Remove meaningless nodes so the Viterbi search does not grow too large
 */
|
||||
public void rmLittleSinglePath() {
|
||||
int maxTo = -1;
|
||||
Term temp = null;
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
if (terms[i] == null)
|
||||
continue;
|
||||
maxTo = terms[i].toValue();
|
||||
if (maxTo - i == 1 || i + 1 == terms.length)
|
||||
continue;
|
||||
for (int j = i; j < maxTo; j++) {
|
||||
temp = terms[j];
|
||||
if (temp != null && temp.toValue() <= maxTo && temp.getName().length() == 1) {
|
||||
terms[j] = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Remove small nodes, ensuring each removed small node's individual score is less than or equal to the larger node's score
 */
|
||||
public void rmLittlePathByScore() {
|
||||
int maxTo = -1;
|
||||
Term temp = null;
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
if (terms[i] == null) {
|
||||
continue;
|
||||
}
|
||||
Term maxTerm = null;
|
||||
double maxScore = 0;
|
||||
Term term = terms[i];
|
||||
// find the term with the best self score and, on ties, the greatest length
|
||||
|
||||
do {
|
||||
if (maxTerm == null || maxScore > term.score()) {
|
||||
maxTerm = term;
|
||||
} else if (maxScore == term.score() && maxTerm.getName().length() < term.getName().length()) {
|
||||
maxTerm = term;
|
||||
}
|
||||
|
||||
} while ((term = term.next()) != null);
|
||||
term = maxTerm;
|
||||
do {
|
||||
maxTo = term.toValue();
|
||||
maxScore = term.score();
|
||||
if (maxTo - i == 1 || i + 1 == terms.length)
|
||||
continue;
|
||||
boolean flag = true; // safe to delete
|
||||
out: for (int j = i; j < maxTo; j++) {
|
||||
temp = terms[j];
|
||||
if (temp == null) {
|
||||
continue;
|
||||
}
|
||||
do {
|
||||
if (temp.toValue() > maxTo || temp.score() < maxScore) {
|
||||
flag = false;
|
||||
break out;
|
||||
}
|
||||
} while ((temp = temp.next()) != null);
|
||||
}
|
||||
// verification passed, safe to delete
|
||||
if (flag) {
|
||||
for (int j = i + 1; j < maxTo; j++) {
|
||||
terms[j] = null;
|
||||
}
|
||||
}
|
||||
} while ((term = term.next()) != null);
|
||||
}
|
||||
}
|
||||
|
||||
public void walkPathByScore() {
|
||||
Term term = null;
|
||||
// score from the BEGIN node first
|
||||
mergerByScore(root, 0);
|
||||
// score forward starting from the first term
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
term = terms[i];
|
||||
while (term != null && term.from() != null && term != end) {
|
||||
int to = term.toValue();
|
||||
mergerByScore(term, to);
|
||||
term = term.next();
|
||||
}
|
||||
}
|
||||
optimalRoot();
|
||||
}
|
||||
|
||||
public void walkPath() {
|
||||
walkPath(null);
|
||||
}
|
||||
|
||||
/**
 * Walk the path, allowing relative weights to be adjusted by intervention (via relationMap)
 *
 * @param relationMap
 */
|
||||
public void walkPath(Map<String, Double> relationMap) {
|
||||
Term term = null;
|
||||
// score from the BEGIN node first
|
||||
merger(root, 0, relationMap);
|
||||
// score forward starting from the first term
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
term = terms[i];
|
||||
while (term != null && term.from() != null && term != end) {
|
||||
int to = term.toValue();
|
||||
merger(term, to, relationMap);
|
||||
term = term.next();
|
||||
}
|
||||
}
|
||||
optimalRoot();
|
||||
}
|
||||
|
||||
/**
 * The concrete traversal-and-scoring step
 *
 * @param fromTerm the term scored from
 * @param to       the target offset
 * @param relationMap
 */
|
||||
private void merger(Term fromTerm, int to, Map<String, Double> relationMap) {
|
||||
Term term = null;
|
||||
if (terms[to] != null) {
|
||||
term = terms[to];
|
||||
while (term != null) {
|
||||
// relation: to.set(from)
|
||||
term.setPathScore(fromTerm, relationMap);
|
||||
term = term.next();
|
||||
}
|
||||
} else {
|
||||
char c = chars[to];
|
||||
TermNatures tn = DATDictionary.getItem(c).termNatures;
|
||||
if (tn == null || tn == TermNatures.NULL) {
|
||||
tn = TermNatures.NULL;
|
||||
}
|
||||
terms[to] = new Term(String.valueOf(c), to, tn);
|
||||
terms[to].setPathScore(fromTerm, relationMap);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Scoring step based on self scores
 *
 * @param fromTerm the term scored from
 * @param to       the target offset
 */
|
||||
private void mergerByScore(Term fromTerm, int to) {
|
||||
Term term = null;
|
||||
if (terms[to] != null) {
|
||||
term = terms[to];
|
||||
while (term != null) {
|
||||
// relation: to.set(from)
|
||||
term.setPathSelfScore(fromTerm);
|
||||
term = term.next();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * For debugging the graph
 */
|
||||
public void printGraph() {
|
||||
for (Term term : terms) {
|
||||
if (term == null) {
|
||||
continue;
|
||||
}
|
||||
System.out.print(term.getName() + "\t" + term.score() + " ,");
|
||||
while ((term = term.next()) != null) {
|
||||
System.out.print(term + "\t" + term.score() + " ,");
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
|
||||
}
@@ -1,108 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.util;
|
||||
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.library.NatureLibrary;
|
||||
import org.ansj.library.NgramLibrary;
|
||||
import org.ansj.recognition.impl.NatureRecognition.NatureTerm;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
public class MathUtil {
|
||||
|
||||
// smoothing parameter
|
||||
private static final double D_SMOOTHING_PARA = 0.1;
|
||||
// separator (tab)
|
||||
private static final String TAB = "\t";
|
||||
// a constant: maximum total frequency
|
||||
private static final int MAX_FREQUENCE = 2079997;// 7528283+329805;
|
||||
// Two linked Words frequency
|
||||
private static final double D_TEMP = (double) 1 / MAX_FREQUENCE;
|
||||
|
||||
/**
 * Score of moving from one word to the next
 *
 * @param from
 *            the preceding word
 * @param to
 *            the following word
 * @return the score
 */
|
||||
public static double compuScore(Term from, Term to, Map<String, Double> relationMap) {
|
||||
double frequency = from.termNatures().allFreq + 1;
|
||||
|
||||
if (frequency < 0) {
|
||||
double score = from.score() + MAX_FREQUENCE;
|
||||
from.score(score);
|
||||
return score;
|
||||
}
|
||||
|
||||
double nTwoWordsFreq = NgramLibrary.getTwoWordFreq(from, to);
|
||||
|
||||
if (relationMap != null) {
|
||||
Double d = relationMap.get(from.getName() + TAB + to.getName());
|
||||
if (d != null) {
|
||||
nTwoWordsFreq += d;
|
||||
}
|
||||
}
|
||||
|
||||
double value = -Math.log(D_SMOOTHING_PARA * frequency / (MAX_FREQUENCE + 80000)
|
||||
+ (1 - D_SMOOTHING_PARA) * ((1 - D_TEMP) * nTwoWordsFreq / frequency + D_TEMP));
|
||||
|
||||
if (value < 0) {
|
||||
value += frequency;
|
||||
}
|
||||
return from.score() + value;
|
||||
}
|
||||
|
||||
/**
 * Compute a score from part of speech, word frequency and word length
 *
 * @param from
 * @param term
 * @return
 */
|
||||
public static double compuScoreFreq(Term from, Term term) {
|
||||
return from.termNatures().allFreq + term.termNatures().allFreq;
|
||||
}
|
||||
|
||||
/**
 * Score between two parts of speech
 *
 * @param from
 * @param to
 * @return
 */
|
||||
public static double compuNatureFreq(NatureTerm from, NatureTerm to) {
|
||||
double twoWordFreq = NatureLibrary.getTwoNatureFreq(from.termNature.nature, to.termNature.nature);
|
||||
if (twoWordFreq == 0) {
|
||||
twoWordFreq = Math.log(from.selfScore + to.selfScore);
|
||||
}
|
||||
double score = from.score + Math.log((from.selfScore + to.selfScore) * twoWordFreq) + to.selfScore;
|
||||
return score;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
System.out.println(Math.log(D_TEMP * 2));
|
||||
}
|
||||
|
||||
}
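Restated as a formula (ignoring the negative-frequency and negative-value guard branches in the code above), compuScore assigns

score(to) = score(from) - \log\left( \lambda \cdot \frac{f_{from}}{N + 80000} + (1 - \lambda)\left( (1 - \varepsilon) \cdot \frac{f_{from,to}}{f_{from}} + \varepsilon \right) \right)

where \lambda = 0.1 (D_SMOOTHING_PARA), N = 2079997 (MAX_FREQUENCE), \varepsilon = 1/N (D_TEMP), f_{from} = allFreq + 1, and f_{from,to} is the bigram frequency from NgramLibrary plus any user-supplied weight from relationMap.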
@@ -1,80 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.util;
|
||||
|
||||
public class MatrixUtil {
|
||||
|
||||
/**
 * Sum of a vector
 *
 * @param dbs
 * @return
 */
|
||||
public static double sum(double[] dbs) {
|
||||
double value = 0;
|
||||
for (double d : dbs) {
|
||||
value += d;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
public static int sum(int[] dbs) {
|
||||
int value = 0;
|
||||
for (int d : dbs) {
|
||||
value += d;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
public static double sum(double[][] w) {
|
||||
|
||||
double value = 0;
|
||||
for (double[] dbs : w) {
|
||||
value += sum(dbs);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
public static void dot(double[] feature, double[] feature1) {
|
||||
if (feature1 == null) {
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < feature1.length; i++) {
|
||||
feature[i] += feature1[i];
|
||||
}
|
||||
}
|
||||
|
||||
public static void dot(float[] feature, float[] feature1) {
|
||||
if (feature1 == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (feature == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
int min = Math.min(feature.length, feature1.length);
|
||||
|
||||
for (int i = 0; i < min; i++) {
|
||||
feature[i] += feature1[i];
|
||||
}
|
||||
}
|
||||
}
@@ -1,389 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.util;
|
||||
|
||||
import org.ansj.app.crf.SplitWord;
|
||||
import org.ansj.dic.DicReader;
|
||||
import org.ansj.dic.impl.Jdbc2Stream;
|
||||
import org.ansj.domain.AnsjItem;
|
||||
import org.ansj.exception.LibraryException;
|
||||
import org.ansj.library.*;
|
||||
import org.ansj.recognition.impl.StopRecognition;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
import org.nlpcn.commons.lang.tire.domain.SmartForest;
|
||||
import org.nlpcn.commons.lang.util.FileFinder;
|
||||
import org.nlpcn.commons.lang.util.IOUtil;
|
||||
import org.nlpcn.commons.lang.util.ObjConver;
|
||||
import org.nlpcn.commons.lang.util.StringUtil;
|
||||
import org.nlpcn.commons.lang.util.logging.Log;
|
||||
import org.nlpcn.commons.lang.util.logging.LogFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.lang.reflect.Field;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.PropertyResourceBundle;
|
||||
import java.util.ResourceBundle;
|
||||
|
||||
/**
 * This class stores some shared variables.
 *
 * @author ansj
 *
 */
|
||||
public class MyStaticValue {
|
||||
|
||||
public static final Log LOG = LogFactory.getLog(MyStaticValue.class);
|
||||
|
||||
// whether person-name recognition is enabled
|
||||
public static Boolean isNameRecognition = true;
|
||||
|
||||
// whether number recognition is enabled
|
||||
public static Boolean isNumRecognition = true;
|
||||
|
||||
// whether numbers and measure words are merged
|
||||
public static Boolean isQuantifierRecognition = true;
|
||||
|
||||
// whether to show the real (original) word
|
||||
public static Boolean isRealName = false;
|
||||
|
||||
/**
 * Whether the user dictionary skips loading words that already exist
 */
|
||||
public static boolean isSkipUserDefine = false;
|
||||
|
||||
public static final Map<String, String> ENV = new HashMap<>();
|
||||
|
||||
static {
|
||||
/**
 * Configuration file variables
 */
|
||||
ResourceBundle rb = null;
|
||||
try {
|
||||
rb = ResourceBundle.getBundle("ansj_library");
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
File find = FileFinder.find("ansj_library.properties", 1);
|
||||
if (find != null && find.isFile()) {
|
||||
rb = new PropertyResourceBundle(
|
||||
IOUtil.getReader(find.getAbsolutePath(), System.getProperty("file.encoding")));
|
||||
LOG.info("load ansj_library not find in classPath ! i find it in " + find.getAbsolutePath()
|
||||
+ " make sure it is your config!");
|
||||
}
|
||||
} catch (Exception e1) {
|
||||
LOG.warn("not find ansj_library.properties. reason: " + e1.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
if (rb == null) {
|
||||
try {
|
||||
rb = ResourceBundle.getBundle("library");
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
File find = FileFinder.find("library.properties", 2);
|
||||
if (find != null && find.isFile()) {
|
||||
rb = new PropertyResourceBundle(
|
||||
IOUtil.getReader(find.getAbsolutePath(), System.getProperty("file.encoding")));
|
||||
LOG.info("load library not find in classPath ! i find it in " + find.getAbsolutePath()
|
||||
+ " make sure it is your config!");
|
||||
}
|
||||
} catch (Exception e1) {
|
||||
LOG.warn("not find library.properties. reason: " + e1.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (rb == null) {
|
||||
LOG.warn("not find library.properties in classpath use it by default !");
|
||||
} else {
|
||||
|
||||
for (String key : rb.keySet()) {
|
||||
ENV.put(key, rb.getString(key));
|
||||
try {
|
||||
String value = rb.getString(key);
|
||||
if (value.startsWith("jdbc:")) { // mask the password in the JDBC string so it is not logged in plain text
|
||||
value = Jdbc2Stream.encryption(value);
|
||||
}
|
||||
LOG.info("init " + key + " to env value is : " + value);
|
||||
Field field = MyStaticValue.class.getField(key);
|
||||
field.set(null, ObjConver.conversion(rb.getString(key), field.getType()));
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Person-name dictionary
 *
 * @return
 */
|
||||
public static BufferedReader getPersonReader() {
|
||||
return DicReader.getReader("person/person.dic");
|
||||
}
|
||||
|
||||
/**
 * Organization-name dictionary
 *
 * @return
 */
|
||||
public static BufferedReader getCompanReader() {
|
||||
return DicReader.getReader("company/company.data");
|
||||
}
|
||||
|
||||
/**
 * New-word frequency dictionary
 *
 * @return
 */
|
||||
public static BufferedReader getNewWordReader() {
|
||||
return DicReader.getReader("newWord/new_word_freq.dic");
|
||||
}
|
||||
|
||||
/**
 * Core dictionary
 *
 * @return
 */
|
||||
public static BufferedReader getArraysReader() {
|
||||
return DicReader.getReader("arrays.dic");
|
||||
}
|
||||
|
||||
/**
 * Number dictionary
 *
 * @return
 */
|
||||
public static BufferedReader getNumberReader() {
|
||||
return DicReader.getReader("numberLibrary.dic");
|
||||
}
|
||||
|
||||
/**
 * English dictionary
 *
 * @return
 */
|
||||
public static BufferedReader getEnglishReader() {
|
||||
return DicReader.getReader("englishLibrary.dic");
|
||||
}
|
||||
|
||||
/**
 * Part-of-speech table
 *
 * @return
 */
|
||||
public static BufferedReader getNatureMapReader() {
|
||||
return DicReader.getReader("nature/nature.map");
|
||||
}
|
||||
|
||||
/**
 * Part-of-speech transition table
 *
 * @return
 */
|
||||
public static BufferedReader getNatureTableReader() {
|
||||
return DicReader.getReader("nature/nature.table");
|
||||
}
|
||||
|
||||
/**
 * Nature class suffix table (determine part of speech by word suffix)
 *
 * @return
 */
|
||||
public static BufferedReader getNatureClassSuffix() {
|
||||
return DicReader.getReader("nature_class_suffix.txt");
|
||||
}
|
||||
|
||||
/**
 * Frequency dictionary of single characters used in person names
 *
 * @return
 */
|
||||
public static BufferedReader getPersonFreqReader() {
|
||||
return DicReader.getReader("person/name_freq.dic");
|
||||
}
|
||||
|
||||
/**
 * Deserialize the person-name frequency object
 *
 * @return
 */
|
||||
@SuppressWarnings("unchecked")
|
||||
public static Map<String, int[][]> getPersonFreqMap() {
|
||||
Map<String, int[][]> map = new HashMap<>(0);
|
||||
try (InputStream inputStream = DicReader.getInputStream("person/asian_name_freq.data")) {
|
||||
ObjectInputStream objectInputStream = new ObjectInputStream(inputStream);
|
||||
map = (Map<String, int[][]>) objectInputStream.readObject();
|
||||
} catch (IOException e) {
|
||||
LOG.warn("IO异常", e);
|
||||
} catch (ClassNotFoundException e) {
|
||||
LOG.warn("找不到类", e);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
/**
 * Load the word-to-word bigram table data
 */
|
||||
public static void initBigramTables() {
|
||||
try (BufferedReader reader = IOUtil.getReader(DicReader.getInputStream("bigramdict.dic"), "UTF-8")) {
|
||||
String temp = null;
|
||||
String[] strs = null;
|
||||
int freq = 0;
|
||||
while ((temp = reader.readLine()) != null) {
|
||||
if (StringUtil.isBlank(temp)) {
|
||||
continue;
|
||||
}
|
||||
strs = temp.split("\t");
|
||||
freq = Integer.parseInt(strs[1]);
|
||||
strs = strs[0].split("@");
|
||||
AnsjItem fromItem = DATDictionary.getItem(strs[0]);
|
||||
|
||||
AnsjItem toItem = DATDictionary.getItem(strs[1]);
|
||||
|
||||
if (fromItem == AnsjItem.NULL && strs[0].contains("#")) {
|
||||
fromItem = AnsjItem.BEGIN;
|
||||
}
|
||||
|
||||
if (toItem == AnsjItem.NULL && strs[1].contains("#")) {
|
||||
toItem = AnsjItem.END;
|
||||
}
|
||||
|
||||
if (fromItem == AnsjItem.NULL || toItem == AnsjItem.NULL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (fromItem.bigramEntryMap == null) {
|
||||
fromItem.bigramEntryMap = new HashMap<Integer, Integer>();
|
||||
}
|
||||
|
||||
fromItem.bigramEntryMap.put(toItem.getIndex(), freq);
|
||||
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
LOG.warn("数字格式异常", e);
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
LOG.warn("不支持的编码", e);
|
||||
} catch (IOException e) {
|
||||
LOG.warn("IO异常", e);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Referenced externally so that loading this class initializes the variables
 */
|
||||
public static Log getLog(Class<?> clazz) {
|
||||
return LogFactory.getLog(clazz);
|
||||
}
|
||||
|
||||
/**
 * Add a dictionary
 *
 * @param key
 * @param path
 * @param value
 */
|
||||
public static void putLibrary(String key, String path, Object value) {
|
||||
if (key.startsWith(DicLibrary.DEFAULT)) {
|
||||
DicLibrary.put(key, path, (Forest) value);
|
||||
} else if (key.startsWith(StopLibrary.DEFAULT)) {
|
||||
StopLibrary.put(key, path, (StopRecognition) value);
|
||||
} else if (key.startsWith(SynonymsLibrary.DEFAULT)) {
|
||||
SynonymsLibrary.put(key, path, (SmartForest) value);
|
||||
} else if (key.startsWith(AmbiguityLibrary.DEFAULT)) {
|
||||
AmbiguityLibrary.put(key, path, (Forest) value);
|
||||
} else if (key.startsWith(CrfLibrary.DEFAULT)) {
|
||||
CrfLibrary.put(key, path, (SplitWord) value);
|
||||
} else {
|
||||
throw new LibraryException(key + " type err must start with dic,stop,ambiguity,synonyms");
|
||||
}
|
||||
ENV.put(key, path);
|
||||
}
|
||||
|
||||
/**
 * Lazily load a dictionary
 *
 * @param key
 * @param path
 */
|
||||
public static void putLibrary(String key, String path) {
|
||||
if (key.startsWith(DicLibrary.DEFAULT)) {
|
||||
DicLibrary.put(key, path);
|
||||
} else if (key.startsWith(StopLibrary.DEFAULT)) {
|
||||
StopLibrary.put(key, path);
|
||||
} else if (key.startsWith(SynonymsLibrary.DEFAULT)) {
|
||||
SynonymsLibrary.put(key, path);
|
||||
} else if (key.startsWith(AmbiguityLibrary.DEFAULT)) {
|
||||
AmbiguityLibrary.put(key, path);
|
||||
} else if (key.startsWith(CrfLibrary.DEFAULT)) {
|
||||
CrfLibrary.put(key, path);
|
||||
} else {
|
||||
throw new LibraryException(key + " type err must start with dic,stop,ambiguity,synonyms");
|
||||
}
|
||||
ENV.put(key, path);
|
||||
}
|
||||
|
||||
/**
 * Remove a dictionary
 *
 * @param key
 */
|
||||
public static void removeLibrary(String key) {
|
||||
if (key.startsWith(DicLibrary.DEFAULT)) {
|
||||
DicLibrary.remove(key);
|
||||
} else if (key.startsWith(StopLibrary.DEFAULT)) {
|
||||
StopLibrary.remove(key);
|
||||
} else if (key.startsWith(SynonymsLibrary.DEFAULT)) {
|
||||
SynonymsLibrary.remove(key);
|
||||
} else if (key.startsWith(AmbiguityLibrary.DEFAULT)) {
|
||||
AmbiguityLibrary.remove(key);
|
||||
} else if (key.startsWith(CrfLibrary.DEFAULT)) {
|
||||
CrfLibrary.remove(key);
|
||||
} else {
|
||||
throw new LibraryException(key + " type err must start with dic,stop,ambiguity,synonyms");
|
||||
}
|
||||
ENV.remove(key);
|
||||
}
|
||||
|
||||
/**
 * Reload a dictionary
 *
 * @param key
 */
|
||||
public static void reloadLibrary(String key) {
|
||||
if (key.startsWith(DicLibrary.DEFAULT)) {
|
||||
DicLibrary.reload(key);
|
||||
} else if (key.startsWith(StopLibrary.DEFAULT)) {
|
||||
StopLibrary.reload(key);
|
||||
} else if (key.startsWith(SynonymsLibrary.DEFAULT)) {
|
||||
SynonymsLibrary.reload(key);
|
||||
} else if (key.startsWith(AmbiguityLibrary.DEFAULT)) {
|
||||
AmbiguityLibrary.reload(key);
|
||||
} else if (key.startsWith(CrfLibrary.DEFAULT)) {
|
||||
CrfLibrary.reload(key);
|
||||
} else {
|
||||
throw new LibraryException(key + " type err must start with dic,stop,ambiguity,synonyms");
|
||||
}
|
||||
}
|
||||
}
@@ -1,72 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.util;
|
||||
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.domain.TermNatures;
|
||||
import org.ansj.recognition.impl.NatureRecognition;
|
||||
import org.nlpcn.commons.lang.tire.domain.Forest;
|
||||
import org.nlpcn.commons.lang.util.WordAlert;
|
||||
|
||||
public class NameFix {
|
||||
/**
 * Person-name disambiguation, e.g. 邓颖超生前 -> 邓颖 / 超生 / 前 is fixed to 邓颖超 / 生 / 前.
 * As an additional rule, two person names joined by -, ·, or • are connected.
 */
|
||||
public static void nameAmbiguity(Term[] terms, Forest... forests) {
|
||||
Term from = null;
|
||||
Term term = null;
|
||||
Term next = null;
|
||||
for (int i = 0; i < terms.length - 1; i++) {
|
||||
term = terms[i];
|
||||
if (term != null && term.termNatures() == TermNatures.NR && term.getName().length() == 2) {
|
||||
next = terms[i + 2];
|
||||
if (next.termNatures().personAttr.split > 0) {
|
||||
term.setName(term.getName() + next.getName().charAt(0));
|
||||
terms[i + 2] = null;
|
||||
|
||||
String name = next.getName().substring(1);
|
||||
terms[i + 3] = new Term(name, next.getOffe() + 1,
|
||||
new NatureRecognition(forests).getTermNatures(name));
|
||||
TermUtil.termLink(term, terms[i + 3]);
|
||||
TermUtil.termLink(terms[i + 3], next.to());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// foreign person-name fix
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
term = terms[i];
|
||||
if (term != null && term.getName().length() == 1 && i > 0
|
||||
&& WordAlert.CharCover(term.getName().charAt(0)) == '·') {
|
||||
from = term.from();
|
||||
next = term.to();
|
||||
|
||||
if (from.natrue().natureStr.startsWith("nr") && next.natrue().natureStr.startsWith("nr")) {
|
||||
from.setName(from.getName() + term.getName() + next.getName());
|
||||
TermUtil.termLink(from, next.to());
|
||||
terms[i] = null;
|
||||
terms[i + 1] = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
@@ -1,220 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.ansj.util;
|
||||
|
||||
import org.ansj.domain.Nature;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.domain.TermNatures;
|
||||
import org.ansj.library.NatureLibrary;
|
||||
import org.ansj.library.company.CompanyAttrLibrary;
|
||||
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Utility class for Term operations
 *
 * @author ansj
 *
 */
|
||||
public class TermUtil {
|
||||
|
||||
/**
 * Merge two terms into a brand-new term
 *
 * @param termNatures
 * @return
 */
|
||||
public static Term makeNewTermNum(Term from, Term to, TermNatures termNatures) {
|
||||
Term term = new Term(from.getName() + to.getName(), from.getOffe(), termNatures);
|
||||
term.termNatures().numAttr = from.termNatures().numAttr;
|
||||
TermUtil.termLink(term, to.to());
|
||||
TermUtil.termLink(term.from(), term);
|
||||
return term;
|
||||
}
|
||||
|
||||
public static void termLink(Term from, Term to) {
|
||||
if (from == null || to == null)
|
||||
return;
|
||||
from.setTo(to);
|
||||
to.setFrom(from);
|
||||
}
|
||||
|
||||
public static enum InsertTermType {
|
||||
/**
 * skip (0)
 */
|
||||
SKIP,
|
||||
/**
 * replace (1)
 */
|
||||
REPLACE,
|
||||
/**
 * accumulate scores, keeping order from large to small (2)
 */
|
||||
SCORE_ADD_SORT
|
||||
}
|
||||
|
||||
/**
 * Insert a term into the linked list at its position. If the term already exists, the behavior follows type:
 * 0 = skip, 1 = replace, 2 = accumulate scores while keeping order from large to small.
 *
 * @param terms
 * @param term
 */
|
||||
public static void insertTerm(Term[] terms, Term term, InsertTermType type) {
|
||||
Term self = terms[term.getOffe()];
|
||||
|
||||
if (self == null) {
|
||||
terms[term.getOffe()] = term;
|
||||
return;
|
||||
}
|
||||
|
||||
int len = term.getName().length();
|
||||
|
||||
// if it is at the first position
|
||||
if (self.getName().length() == len) {
|
||||
if (type == InsertTermType.REPLACE) {
|
||||
term.setNext(self.next());
|
||||
terms[term.getOffe()] = term;
|
||||
} else if (type == InsertTermType.SCORE_ADD_SORT) {
|
||||
self.score(self.score() + term.score());
|
||||
self.selfScore(self.selfScore() + term.selfScore());
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (self.getName().length() > len) {
|
||||
term.setNext(self);
|
||||
terms[term.getOffe()] = term;
|
||||
return;
|
||||
}
|
||||
|
||||
Term next = self;
|
||||
Term before = self;
|
||||
while ((next = before.next()) != null) {
|
||||
if (next.getName().length() == len) {
|
||||
if (type == InsertTermType.REPLACE) {
|
||||
term.setNext(next.next());
|
||||
before.setNext(term);
|
||||
} else if (type == InsertTermType.SCORE_ADD_SORT) {
|
||||
next.score(next.score() + term.score());
|
||||
next.selfScore(next.selfScore() + term.selfScore());
|
||||
}
|
||||
return;
|
||||
} else if (next.getName().length() > len) {
|
||||
before.setNext(term);
|
||||
term.setNext(next);
|
||||
return;
|
||||
}
|
||||
before = next;
|
||||
}
|
||||
|
||||
before.setNext(term); // nothing matched above, append at the end
|
||||
}
|
||||
|
||||
public static void insertTermNum(Term[] terms, Term term) {
|
||||
terms[term.getOffe()] = term;
|
||||
}
|
||||
|
||||
public static void insertTerm(Term[] terms, List<Term> tempList, TermNatures nr) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int offe = tempList.get(0).getOffe();
|
||||
for (Term term : tempList) {
|
||||
sb.append(term.getName());
|
||||
terms[term.getOffe()] = null;
|
||||
}
|
||||
Term term = new Term(sb.toString(), offe, TermNatures.NR);
|
||||
insertTermNum(terms, term);
|
||||
}
|
||||
|
||||
protected static Term setToAndfrom(Term to, Term from) {
|
||||
|
||||
from.setTo(to);
|
||||
to.setFrom(from);
|
||||
return from;
|
||||
}
|
||||
|
||||
private static final HashMap<String, int[]> companyMap = CompanyAttrLibrary.getCompanyMap();
|
||||
|
||||
/**
 * Obtain the fine-grained segmentation and determine the part of speech
 *
 * @return null means the term is already at the finest granularity
 */
|
||||
public static void parseNature(Term term) {
|
||||
if (!Nature.NW.equals(term.natrue())) {
|
||||
return;
|
||||
}
|
||||
|
||||
String name = term.getName();
|
||||
|
||||
if (name.length() <= 3) {
|
||||
return;
|
||||
}
|
||||
|
||||
// is it a foreign person name?
|
||||
if (ForeignPersonRecognition.isFName(name)) {
|
||||
term.setNature(NatureLibrary.getNature("nrf"));
|
||||
return;
|
||||
}
|
||||
|
||||
List<Term> subTerm = term.getSubTerm();
|
||||
|
||||
// check whether it is an organization name
|
||||
term.setSubTerm(subTerm);
|
||||
Term first = subTerm.get(0);
|
||||
Term last = subTerm.get(subTerm.size() - 1);
|
||||
int[] is = companyMap.get(first.getName());
|
||||
int all = 0;
|
||||
|
||||
is = companyMap.get(last.getName());
|
||||
if (is != null) {
|
||||
all += is[1];
|
||||
}
|
||||
|
||||
if (all > 1000) {
|
||||
term.setNature(NatureLibrary.getNature("nt"));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Generate the sub-terms between from and to
 *
 * @param from
 * @param to
 * @return
 */
|
||||
public static List<Term> getSubTerm(Term from, Term to) {
|
||||
|
||||
List<Term> subTerm = new ArrayList<>(3);
|
||||
|
||||
while ((from = from.to()) != to) {
|
||||
subTerm.add(from);
|
||||
}
|
||||
|
||||
return subTerm;
|
||||
}
|
||||
|
||||
}
@@ -1,83 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.deeplearning4j.nlp.chinese.tokenization.tokenizer;
|
||||
|
||||
import org.ansj.domain.Result;
|
||||
import org.ansj.domain.Term;
|
||||
import org.ansj.splitWord.analysis.NlpAnalysis;
|
||||
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
|
||||
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
|
||||
public class ChineseTokenizer implements Tokenizer {
|
||||
|
||||
private TokenPreProcess tokenPreProcess;
|
||||
private List<Term> tokenList;
|
||||
private Iterator<Term> tokenIter;
|
||||
|
||||
public ChineseTokenizer() {}
|
||||
|
||||
public ChineseTokenizer(String toTokenize) {
|
||||
Result result = NlpAnalysis.parse(toTokenize);
|
||||
this.tokenList = result.getTerms();
|
||||
this.tokenIter = tokenList.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasMoreTokens() {
|
||||
return tokenIter.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int countTokens() {
|
||||
return tokenList != null ? tokenList.size() : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String nextToken() {
|
||||
if (!hasMoreTokens()) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
return this.tokenPreProcess != null ? this.tokenPreProcess.preProcess(tokenIter.next().getName())
|
||||
: tokenIter.next().getName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getTokens() {
|
||||
List<String> tokenList = new ArrayList<>();
|
||||
|
||||
while (hasMoreTokens()) {
|
||||
tokenList.add(nextToken());
|
||||
}
|
||||
return tokenList;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setTokenPreProcessor(TokenPreProcess tokenPreProcessor) {
|
||||
this.tokenPreProcess = tokenPreProcessor;
|
||||
}
|
||||
|
||||
}
@@ -1,58 +0,0 @@
/*
|
||||
* ******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* * This program and the accompanying materials are made available under the
|
||||
* * terms of the Apache License, Version 2.0 which is available at
|
||||
* * https://www.apache.org/licenses/LICENSE-2.0.
|
||||
* *
|
||||
* * See the NOTICE file distributed with this work for additional
|
||||
* * information regarding copyright ownership.
|
||||
* * Unless required by applicable law or agreed to in writing, software
|
||||
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* * License for the specific language governing permissions and limitations
|
||||
* * under the License.
|
||||
* *
|
||||
* * SPDX-License-Identifier: Apache-2.0
|
||||
* *****************************************************************************
|
||||
*/
|
||||
|
||||
package org.deeplearning4j.nlp.chinese.tokenization.tokenizerFactory;
|
||||
|
||||
import org.deeplearning4j.nlp.chinese.tokenization.tokenizer.ChineseTokenizer;
|
||||
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
|
||||
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
|
||||
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
public class ChineseTokenizerFactory implements TokenizerFactory {
|
||||
|
||||
private TokenPreProcess tokenPreProcess;
|
||||
|
||||
@Override
|
||||
public Tokenizer create(String toTokenize) {
|
||||
Tokenizer tokenizer = new ChineseTokenizer(toTokenize);
|
||||
tokenizer.setTokenPreProcessor(tokenPreProcess);
|
||||
return tokenizer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(InputStream toTokenize) {
|
||||
throw new UnsupportedOperationException();
|
||||
/* Tokenizer t = new ChineseStreamTokenizer(toTokenize);
|
||||
t.setTokenPreProcessor(tokenPreProcess);
|
||||
return t;*/
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setTokenPreProcessor(TokenPreProcess tokenPreProcess) {
|
||||
this.tokenPreProcess = tokenPreProcess;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenPreProcess getTokenPreProcessor() {
|
||||
return tokenPreProcess;
|
||||
}
|
||||
}
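A minimal sketch of how the removed tokenizer and factory fit together (illustrative only; the sample text and wrapper class are hypothetical, the API calls are those declared in these two files):

import org.deeplearning4j.nlp.chinese.tokenization.tokenizerFactory.ChineseTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class ChineseTokenizerExample {
    public static void main(String[] args) {
        TokenizerFactory factory = new ChineseTokenizerFactory();
        // create(String) builds a ChineseTokenizer backed by NlpAnalysis.parse
        Tokenizer tokenizer = factory.create("青山绿水和大好河山");
        for (String token : tokenizer.getTokens()) {
            System.out.println(token);
        }
    }
}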
@@ -1,201 +0,0 @@
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
@@ -1,8 +0,0 @@
ansj_seg
Copyright 2011-2016 ansj_seg

the deeplearning4j-nlp-chinese
Copyright 2017-2022 the deeplearning4j-nlp-chinese

This product includes software developed by The Apache Software
Foundation (http://www.apache.org/).
Binary file not shown. (image removed; 57 KiB)
File diff suppressed because it is too large.
File diff suppressed because it is too large.
File diff suppressed because it is too large.
Binary file not shown.
@@ -1,105 +0,0 @@
a 4
b 4
c 4
d 4
e 4
f 4
g 4
h 4
i 4
j 4
k 4
l 4
m 4
n 4
o 4
p 4
q 4
r 4
s 4
t 4
u 4
v 4
w 4
x 4
y 4
z 4
A 4
B 4
C 4
D 4
E 4
F 4
G 4
H 4
I 4
J 4
K 4
L 4
M 4
N 4
O 4
P 4
Q 4
R 4
S 4
T 4
U 4
V 4
W 4
X 4
Y 4
Z 4
' 4
a 4
b 4
c 4
d 4
e 4
f 4
g 4
h 4
i 4
j 4
k 4
l 4
m 4
n 4
o 4
p 4
q 4
r 4
s 4
t 4
u 4
v 4
w 4
x 4
y 4
z 4
A 4
B 4
C 4
D 4
E 4
F 4
G 4
H 4
I 4
J 4
K 4
M 4
L 4
N 4
O 4
P 4
Q 4
R 4
S 4
T 4
U 4
V 4
W 4
X 4
Y 4
Z 4
File diff suppressed because it is too large.
@@ -1,50 +0,0 @@
0 0 始##始 50610
1 1 末##末 0
2 2 a 34439
3 3 ad 5899
4 4 ag 311
5 5 an 2838
6 6 b 8734
7 7 bg 5
8 8 c 25473
9 9 d 47714
10 10 dg 126
11 11 e 26
12 12 f 17248
13 13 h 48
14 14 i 5001
15 15 j 10293
16 16 k 958
17 17 l 6055
18 18 m 41036
19 19 mg 6
20 20 n 237124
21 21 ng 4497
22 22 nr 20061
23 23 ns 27777
24 24 nt 3565
25 25 nx 459
26 26 nz 3728
27 27 o 70
28 28 p 39906
29 29 q 24236
30 30 r 32367
31 31 rg 10
32 32 s 3868
33 33 t 20646
34 34 tg 486
35 35 u 5194
36 36 ud 661
37 37 ug 449
38 38 uj 54477
39 39 ul 10234
40 40 uv 2121
41 41 uz 1664
42 42 v 184620
43 43 vd 493
44 44 vg 1866
45 45 vn 42615
46 46 w 173046
47 47 y 1892
48 48 yg 1
49 49 z 1315
@@ -1,50 +0,0 @@
[Deleted data file: a 50 × 50 table of integer counts, one row per tag id 0–49.]
@@ -1,996 +0,0 @@
[Deleted dictionary data: 996 suffix entries with part-of-speech tags and frequency counts — organization-name suffixes tagged nt (e.g. 公司 883182.0, 厂 689589.0, 部 337479.0, 中心 159768.0) and place-name suffixes tagged ns (e.g. 镇 13727.0, 乡 12503.0, 街道 4309.0, 村 3266.0).]
File diff suppressed because it is too large.
File diff suppressed because it is too large.
@@ -1,51 +0,0 @@
0 5
1 5
2 5
3 5
4 5
5 5
6 5
7 5
8 5
9 5
0 5
1 5
2 5
3 5
4 5
5 5
6 5
7 5
8 5
9 5
% 5
零 5
一 5
二 5
三 5
四 5
五 5
六 5
七 5
八 5
九 5
十 5
百 5
千 5
万 5
亿 5
兆 5
零 5
壹 5
贰 5
叁 5
肆 5
伍 5
陆 5
柒 5
捌 5
玖 5
拾 5
佰 5
仟 5
. 5
Binary file not shown.
File diff suppressed because it is too large.
@@ -1,48 +0,0 @@
/*
 *  ******************************************************************************
 *  *
 *  *
 *  * This program and the accompanying materials are made available under the
 *  * terms of the Apache License, Version 2.0 which is available at
 *  * https://www.apache.org/licenses/LICENSE-2.0.
 *  *
 *  * See the NOTICE file distributed with this work for additional
 *  * information regarding copyright ownership.
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  * License for the specific language governing permissions and limitations
 *  * under the License.
 *  *
 *  * SPDX-License-Identifier: Apache-2.0
 *  *****************************************************************************
 */
package org.deeplearning4j.text.tokenization.tokenizer;

import lombok.extern.slf4j.Slf4j;
import java.util.*;

import org.deeplearning4j.BaseDL4JTest;
import org.nd4j.common.tests.AbstractAssertTestsClass;

// Asserts that every test class under org.deeplearning4j extends BaseDL4JTest, unless listed in the exclusions.
@Slf4j
public class AssertTestsExtendBaseClass extends AbstractAssertTestsClass {

    @Override
    protected Set<Class<?>> getExclusions() {
        Set<Class<?>> exclusions = new HashSet<>();
        return exclusions;
    }

    @Override
    protected String getPackageName() {
        return "org.deeplearning4j";
    }

    @Override
    protected Class<?> getBaseClass() {
        return BaseDL4JTest.class;
    }
}
@@ -1,79 +0,0 @@
/*
 *  ******************************************************************************
 *  *
 *  *
 *  * This program and the accompanying materials are made available under the
 *  * terms of the Apache License, Version 2.0 which is available at
 *  * https://www.apache.org/licenses/LICENSE-2.0.
 *  *
 *  * See the NOTICE file distributed with this work for additional
 *  * information regarding copyright ownership.
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  * License for the specific language governing permissions and limitations
 *  * under the License.
 *  *
 *  * SPDX-License-Identifier: Apache-2.0
 *  *****************************************************************************
 */

package org.deeplearning4j.text.tokenization.tokenizer;

import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.BaseDL4JTest;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.nlp.chinese.tokenization.tokenizerFactory.ChineseTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.junit.Ignore;
import org.junit.Test;

import java.io.File;
import java.io.IOException;

import static org.junit.Assert.assertEquals;

@Slf4j
public class ChineseTokenizerTest extends BaseDL4JTest {

    private final String toTokenize = "青山绿水和伟大的科学家让世界更美好和平";
    private final String[] expect = {"青山绿水", "和", "伟大", "的", "科学家", "让", "世界", "更", "美好", "和平"};

    @Test
    public void testChineseTokenizer() {
        TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
        Tokenizer tokenizer = tokenizerFactory.create(toTokenize);
        assertEquals(expect.length, tokenizer.countTokens());
        for (int i = 0; i < tokenizer.countTokens(); ++i) {
            assertEquals(tokenizer.nextToken(), expect[i]);
        }
    }

    // Trains a Word2Vec model on a dataset of Chinese names, then looks the names up in that dataset.
    @Ignore
    @Test
    public void testFindNamesFromText() throws IOException {
        SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");

        log.info("load is right!");
        TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
        //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

        // Generates word vectors from the dataset stored in the resources folder
        Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
                .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
        vec.fit();
        WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));

        // A follow-up step would train a model that finds all names in news text (.txt files),
        // using the word vectors generated above.
        // WordVectors wordVectors;

        // ...and then test whether that model finds names in previously unseen text.
    }

}
@@ -1,78 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~ /* ******************************************************************************
  ~  *
  ~  *
  ~  * This program and the accompanying materials are made available under the
  ~  * terms of the Apache License, Version 2.0 which is available at
  ~  * https://www.apache.org/licenses/LICENSE-2.0.
  ~  *
  ~  * See the NOTICE file distributed with this work for additional
  ~  * information regarding copyright ownership.
  ~  * Unless required by applicable law or agreed to in writing, software
  ~  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  ~  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  ~  * License for the specific language governing permissions and limitations
  ~  * under the License.
  ~  *
  ~  * SPDX-License-Identifier: Apache-2.0
  ~  ******************************************************************************/
  -->

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.deeplearning4j</groupId>
        <artifactId>deeplearning4j-nlp-parent</artifactId>
        <version>1.0.0-SNAPSHOT</version>
    </parent>

    <artifactId>deeplearning4j-nlp-japanese</artifactId>

    <properties>
        <kuromoji.version>0.9.0</kuromoji.version>
        <randomizedtesting.version>2.1.16</randomizedtesting.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
        </dependency>
        <!--
        <dependency>
            <groupId>com.atilika.kuromoji</groupId>
            <artifactId>kuromoji-ipadic</artifactId>
            <version>${kuromoji.version}</version>
            <type>jar</type>
            <scope>compile</scope>
        </dependency>
        -->
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-nlp</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>com.carrotsearch.randomizedtesting</groupId>
            <artifactId>randomizedtesting-runner</artifactId>
            <version>${randomizedtesting.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
        </dependency>
    </dependencies>

    <profiles>
        <profile>
            <id>test-nd4j-native</id>
        </profile>
        <profile>
            <id>test-nd4j-cuda-11.0</id>
        </profile>
    </profiles>
</project>
Loading…
Reference in New Issue