Skip to content

Commit 629f8e9

Browse files
author
lapis-hong
committed
fix bugs
1 parent 70f54d1 commit 629f8e9

14 files changed

+48005
-47833
lines changed

README.md

Lines changed: 69 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,28 @@
1-
# xinci
2-
## 新词发现 Chinese Words Extraction & New Words Finder (python package).
1+
# xinci 新词 & 抽词
2+
xinci is a Python interface for Chinese word extraction and new word discovery.
3+
[https://pypi.org/project/xinci/]
34

5+
## Requirements
6+
Python >= 2.7
47

5-
## install
8+
## Installation
69
### 1. using pip
7-
```
10+
```shell
811
pip install xinci
912
```
1013
### 2. using setup.py
11-
```
14+
``` shell
1215
git clone git@github.com:Lapis-Hong/xinci.git
1316
cd xinci
1417
python setup.py install
1518
```
1619

17-
## Usage:
20+
## Usage
21+
This package has two main use cases: word extraction and
22+
new word discovery.
1823

1924
### 1. command line
20-
```
25+
```shell
2126
cd xinci
2227
python word_extraction.py
2328
```
@@ -26,22 +31,66 @@ or
2631
./run.sh
2732
```
2833

29-
### 2. as a python package
30-
```
31-
>>> import xinci
32-
# modify common word dic, add words or remove words
34+
### 2. python package
35+
```python
36+
import xinci
37+
38+
# if you want to see logging events.
39+
import logging
40+
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')
41+
42+
# init default dictionary or user dic,
3343
dic = xinci.Dictionary()
34-
dic.add(['神马']
35-
dic.add_from_file('user.dic')
44+
# load vocab, vocab is a python set.
45+
vocab = dic.load() # or dic.dictionary
46+
print(vocab)
3647

37-
dic.remove(['神马']
38-
dic.remove_from_file('user.dic')
48+
# add words to dic
49+
dic.add(['神马']) # or dic.add_from_file('user.dic')
50+
# remove words from dic
51+
dic.remove(['神马']) # or dic.remove_from_file('user.dic')
3952

40-
# find new words
41-
we = xinci.WordExtract('corpus.txt')
42-
we.extract()
53+
# extract new words, xc is a set
54+
xc = xinci.extract('corpus.txt')
55+
for w in xc:
56+
print(w)
57+
# extract all words, c is a set
58+
c = xinci.extract('corpus.txt', all_words=True)
59+
for w in c:
60+
print(w)
61+
```
62+
result
63+
```angular2html
64+
发现5个新词如下:
65+
@新词 @词频
66+
祛斑 13
67+
后再 7
68+
今日头条 9
69+
洗净切 7
70+
蛋液 9
71+
```
72+
### Note: Iteratively adding the extracted entries that are not really new words to the common dic will improve the results a lot.
73+
74+
75+
## API documentation
76+
```python
77+
xc = xinci.extract(params)
78+
79+
```
80+
List of available `params` and their default value:
81+
```angular2html
82+
corpus_file: string, input corpus file (required)
83+
common_words_file: string, common words dic file [common.dic]
84+
min_candidate_len: int, min candidate word length [2]
85+
max_candidate_len: int, max candidate word length [5]
86+
least_cnt_threshold: int, least word count to extract [5]
87+
solid_rate_threshold: float, solid rate threshold [0.018]
88+
entropy_threshold: float, entropy threshold [1.92]
89+
all_words: bool, set True to extract all words mode [False]
90+
save_file: string, output file [None]
4391
```
4492

45-
The code is based on follow java version
46-
https://github.com/GeorgeBourne/grid
93+
## References
94+
The code is based on this Java version:
95+
[https://github.com/GeorgeBourne/grid]
4796

examples/find_newwords_example.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
# @Author: lapis-hong
4+
# @Date : 2018/6/19
5+
import xinci
6+
7+
# if you want to see logging events.
8+
import logging
9+
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')
10+
11+
# init default dictionary or user dic,
12+
dic = xinci.Dictionary()
13+
# load vocab, vocab is a python set.
14+
vocab = dic.load() # or dic.dictionary
15+
# print(vocab)
16+
17+
# add words to dic
18+
dic.add(['lll']) # or dic.add_from_file('user.dic')
19+
# remove words from dic
20+
dic.remove(['lll']) # or dic.remove_from_file('user.dic')
21+
22+
# extract new words, xc is a set
23+
xc = xinci.extract('../xinci/test.txt')
24+
for w in xc:
25+
print(w)
26+
27+
# extract all words, c is a set
28+
c = xinci.extract('../xinci/test.txt', all_words=True)
29+
for w in xc:
30+
print(w)

run.sh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/usr/bin/env bash
2+
3+
# This script runs the xinci program.
4+
# Usage:
5+
# 1. bash run.sh $corpus_file # use default common words dic
6+
# 2. bash run.sh $corpus_file $user_dic_file # pass 2nd argument to load user common words dic
7+
set -e
8+
9+
show_usage() {
10+
echo 'Usage: bash run.sh $corpus_file [$user_dic_file]'
11+
}
12+
13+
14+
15+
cd xinci
16+
17+
if [ $# == 0 ]; then
18+
show_usage
19+
elif [ $# == 1 ]; then
20+
python word_extraction.py -f $1
21+
elif [ $# == 2 ]; then
22+
python word_extraction.py -f $1 -d $2
23+
fi
24+

setup.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,28 +5,38 @@
55
# from distutils.core import setup
66
from setuptools import setup, find_packages
77

8+
89
setup(name="xinci",
9-
version="1.0",
10-
description="Chinese words extraction and find new words",
10+
version="1.1.0",
11+
description="Chinese words extraction and new words discovery",
12+
long_description=open("README.md", "r").read(),
13+
long_description_content_type="text/markdown",
1114
author="Lapis-Hong",
1215
author_email="dinghongquan@sjtu.edu.cn",
13-
url="https://github.com/lapis-hong/new-words-finder",
16+
url="https://github.com/lapis-hong/xinci",
1417
license="MIT",
1518
keywords='NLP, Chinese words extraction, New words discovery',
16-
# Name the folder where your packages live:
17-
# (If you have other packages (dirs) or modules (py files) then
18-
# put them into the package directory - they will be found
19-
# recursively.)
20-
packages=['xinci'],
21-
# 'package' package must contain files (see list above)
22-
# I called the package 'package' thus cleverly confusing the whole issue...
23-
# This dict maps the package name =to=> directories
24-
# It says, package *needs* these files.
19+
# Name the folder where your packages live:
20+
# (If you have other packages (dirs) or modules (py files) then
21+
# put them into the package directory - they will be found
22+
# recursively.)
23+
packages=find_packages(),
24+
# 'package' package must contain files (see list above)
25+
# I called the package 'package' thus cleverly confusing the whole issue...
26+
# This dict maps the package name =to=> directories
27+
# It says, package *needs* these files.
2528
package_dir={'xinci': 'xinci'},
26-
package_data={'package': 'xinci'},
27-
# 'runner' is in the root.
29+
package_data={'xinci': ['*.*']}, # include common.dic
30+
# 'runner' is in the root.
2831
# scripts=["runner"],
29-
long_description="""Really long text here.""",
3032
# This next part it for the Cheese Shop, look a little down the page.
31-
classifiers = []
33+
classifiers=[
34+
'Programming Language :: Python :: 2.7',
35+
'Programming Language :: Python :: 3.2',
36+
'Programming Language :: Python :: 3.3',
37+
'Programming Language :: Python :: 3.4',
38+
'Programming Language :: Python :: 3.5',
39+
'Programming Language :: Python :: 3.6',
40+
'Topic :: Scientific/Engineering :: Artificial Intelligence'
41+
]
3242
)

test/dictionary_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class TestDictionary(unittest.TestCase):
1212
pass
1313

1414
if __name__ == '__main__':
15-
logging.basicConfig(level=logging.INFO) # 设置日志级别
15+
# logging.basicConfig(level=logging.INFO) # 设置日志级别
1616
dic = Dictionary()
1717
for w in dic:
1818
print(w)

test/word_extraction_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from xinci.word_extraction import WordExtract
1010

1111
if __name__ == '__main__':
12-
logging.basicConfig(level=logging.INFO) # 设置日志级别
12+
logging.basicConfig(level=logging.INFO)
1313
new_word_finder = WordExtract('../xinci/test.txt', '../xinci/common.dic')
1414
new_word_finder.extract('../result.txt')
15-
# cProfile.run('new_word_finder.extract()')
15+
cProfile.run('new_word_finder.extract()')

user.dic

Lines changed: 0 additions & 1 deletion
This file was deleted.

xinci/__init__.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,24 @@
22
# coding: utf-8
33
# @Author: lapis-hong
44
# @Date : 2018/6/18
5-
"""This package contains interfaces and functionality. """
6-
from __future__ import absolute_import
5+
"""This package contains interfaces and functionality to xinci. """
6+
# from __future__ import absolute_import
77
from __future__ import unicode_literals
8-
import logging
98

9+
from .dictionary import Dictionary
10+
from .word_extraction import extract
11+
12+
13+
__version__ = '1.1.0'
14+
15+
__all__ = [Dictionary, extract]
16+
17+
18+
# if len(logger.handlers) == 0: # To ensure reload() doesn't add another one
19+
# logger.addHandler(logging.NullHandler())
1020

11-
from xinci.dictionary import Dictionary
12-
from xinci.word_extraction import WordExtract
1321

1422

15-
__version__ = '1.0'
1623

17-
__all__ = []
1824

1925

20-
logger = logging.getLogger('xinci')

0 commit comments

Comments
 (0)