Lapis-Hong
diff --git a/‎README.md
Lines changed: 69 additions & 20 deletions b/‎README.md
Lines changed: 69 additions & 20 deletions
diff --git a/‎examples/find_newwords_example.py
Lines changed: 30 additions & 0 deletions b/‎examples/find_newwords_example.py
Lines changed: 30 additions & 0 deletions
diff --git a/‎run.sh
Lines changed: 24 additions & 0 deletions b/‎run.sh
Lines changed: 24 additions & 0 deletions
diff --git a/‎setup.py
Lines changed: 26 additions & 16 deletions b/‎setup.py
Lines changed: 26 additions & 16 deletions
diff --git a/‎test/dictionary_test.py
Lines changed: 1 addition & 1 deletion b/‎test/dictionary_test.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎test/word_extraction_test.py
Lines changed: 2 additions & 2 deletions b/‎test/word_extraction_test.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎user.dic
Lines changed: 0 additions & 1 deletion b/‎user.dic
Lines changed: 0 additions & 1 deletion
diff --git a/‎xinci/__init__.py
Lines changed: 13 additions & 8 deletions b/‎xinci/__init__.py
Lines changed: 13 additions & 8 deletions
@@ -1,23 +1,28 @@
-# xinci 
-## 新词发现 Chinese Words Extraction & New Words Finder (python package).
+# xinci 新词 & 抽词
+xinci is a Python interface for Chinese word extraction & new word discovery.
+<https://pypi.org/project/xinci/>
 
+## Requirements
+Python >= 2.7
 
-## install
+## Installation
 ### 1. using pip
-```
+```shell
 pip install xinci
 ```
 ### 2. using setup.py
-``` 
+``` shell
 git clone git@github.com:Lapis-Hong/xinci.git  
 cd xinci 
 python setup.py install
 ```
 
-## Usage:
+## Usage
+This package has two main use cases: extracting words and
+discovering new words.
 
 ### 1. command line
-```
+```shell
 cd xinci
 python word_extraction.py 
 ```
@@ -26,22 +31,66 @@ or
 ./run.sh
 ```
 
-### 2. as a python package
-```
->>> import xinci
-# modify common word dic, add words or remove words
+### 2. python package
+```python 
+import xinci
+
+# if you want to see logging events.
+import logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')
+
+# init default dictionary or user dic,
 dic = xinci.Dictionary()
-dic.add(['神马']
-dic.add_from_file('user.dic')
+# load vocab, vocab is a python set.
+vocab = dic.load()  # or dic.dictionary
+print(vocab)
 
-dic.remove(['神马']
-dic.remove_from_file('user.dic')
+# add words to dic
+dic.add(['神马'])  # or dic.add_from_file('user.dic')
+# remove words from dic
+dic.remove(['神马'])  # or dic.remove_from_file('user.dic')
 
-# find new words
-we = xinci.WordExtract('corpus.txt')
-we.extract()
+# extract new words, xc is a set
+xc = xinci.extract('corpus.txt')
+for w in xc:
+    print(w)
+# extract all words, c is a set
+c = xinci.extract('corpus.txt', all_words=True)
+for w in c:
+    print(w)
+```
+result
+```angular2html
+发现5个新词如下:
+@新词	@词频
+祛斑	13
+后再	7
+今日头条	9
+洗净切	7
+蛋液	9
+```
+### Notes: Iteratively adding words from the result that do not look like new words to the common dic will improve the results a lot.
+
+
+## API documentation
+```python
+xc = xinci.extract(params)
+
+```
+List of available `params` and their default value:
+```angular2html
+corpus_file:           string, input corpus file (required)
+common_words_file:     string, common words dic file [common.dic]
+min_candidate_len:     int, min candidate word length [2]
+max_candidate_len:     int, max candidate word length [5]
+least_cnt_threshold:   int, least word count to extract [5]
+solid_rate_threshold:  float, solid rate threshold [0.018]
+entropy_threshold:     float, entropy threshold [1.92]
+all_words:             bool, set True to extract all words mode [False]
+save_file:             string, output file [None]
 ```
 
-The code is based on follow java version
-https://github.com/GeorgeBourne/grid
+## References
+The code is based on this Java version:
+<https://github.com/GeorgeBourne/grid>
 
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# coding: utf-8
+# @Author: lapis-hong
+# @Date  : 2018/6/19
+"""Example: manage the common-words dictionary and extract words with xinci."""
+import logging
+
+import xinci
+
+# Optional: configure logging if you want to see logging events.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')
+
+# Init the default dictionary (or a user dic) and load its vocab (a Python set).
+dic = xinci.Dictionary()
+vocab = dic.load()  # or dic.dictionary
+
+# Add words to the dic, then remove them again.
+dic.add(['lll'])  # or dic.add_from_file('user.dic')
+dic.remove(['lll'])  # or dic.remove_from_file('user.dic')
+
+# Extract new words only; xc is a set.
+xc = xinci.extract('../xinci/test.txt')
+for w in xc:
+    print(w)
+
+# Extract all words; c is a set.
+c = xinci.extract('../xinci/test.txt', all_words=True)
+# Iterate over c (the all-words result), not xc, so the second run is shown.
+for w in c:
+    print(w)
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# This script runs the xinci program.
+# Usage:
+#       1. bash run.sh  $corpus_file                 # use default common words dic
+#       2. bash run.sh  $corpus_file $user_dic_file  # pass 2nd argument to load user common words dic
+set -e
+
+# Print a one-line usage summary (the $names are literal placeholders).
+show_usage() {
+    echo 'Usage: bash run.sh $corpus_file [$user_dic_file]'
+}
+
+cd xinci
+
+# Quote the arguments so paths containing spaces stay single words.
+if [ "$#" -eq 1 ]; then
+    python word_extraction.py -f "$1"
+elif [ "$#" -eq 2 ]; then
+    python word_extraction.py -f "$1" -d "$2"
+else
+    show_usage
+fi
+
@@ -5,28 +5,38 @@
 # from distutils.core import setup
 from setuptools import setup, find_packages
 
+
 setup(name="xinci",
-      version="1.0",
-      description="Chinese words extraction and find new words",
+      version="1.1.0",
+      description="Chinese words extraction and new words discovery",
+      long_description=open("README.md", "r").read(),
+      long_description_content_type="text/markdown",
       author="Lapis-Hong",
       author_email="dinghongquan@sjtu.edu.cn",
-      url="https://github.com/lapis-hong/new-words-finder",
+      url="https://github.com/lapis-hong/xinci",
       license="MIT",
       keywords='NLP, Chinese words extraction, New words discovery',
-        # Name the folder where your packages live:
-        # (If you have other packages (dirs) or modules (py files) then
-        # put them into the package directory - they will be found
-        # recursively.)
-      packages=['xinci'],
-        # 'package' package must contain files (see list above)
-        # I called the package 'package' thus cleverly confusing the whole issue...
-        # This dict maps the package name =to=> directories
-        # It says, package *needs* these files.
+      # Name the folder where your packages live:
+      # (If you have other packages (dirs) or modules (py files) then
+      # put them into the package directory - they will be found
+      # recursively.)
+      packages=find_packages(),
+      # 'package' package must contain files (see list above)
+      # I called the package 'package' thus cleverly confusing the whole issue...
+      # This dict maps the package name =to=> directories
+      # It says, package *needs* these files.
       package_dir={'xinci': 'xinci'},
-      package_data={'package': 'xinci'},
-        # 'runner' is in the root.
+      package_data={'xinci': ['*.*']},  # include common.dic
+      # 'runner' is in the root.
       # scripts=["runner"],
-      long_description="""Really long text here.""",
       # This next part it for the Cheese Shop, look a little down the page.
-      classifiers = []
+      classifiers=[
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3.2',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence'
+      ]
 )
@@ -12,7 +12,7 @@ class TestDictionary(unittest.TestCase):
     pass
 
 if __name__ == '__main__':
-    logging.basicConfig(level=logging.INFO)  # 设置日志级别
+    # logging.basicConfig(level=logging.INFO)  # 设置日志级别
     dic = Dictionary()
     for w in dic:
         print(w)
 
@@ -9,7 +9,7 @@
 from xinci.word_extraction import WordExtract
 
 if __name__ == '__main__':
-    logging.basicConfig(level=logging.INFO)  # 设置日志级别
+    logging.basicConfig(level=logging.INFO)
     new_word_finder = WordExtract('../xinci/test.txt', '../xinci/common.dic')
     new_word_finder.extract('../result.txt')
-    # cProfile.run('new_word_finder.extract()')
+    cProfile.run('new_word_finder.extract()')
@@ -2,19 +2,24 @@
 # coding: utf-8
 # @Author: lapis-hong
 # @Date  : 2018/6/18
-"""This package contains interfaces and functionality. """
-from __future__ import absolute_import
+"""This package contains interfaces and functionality to xinci. """
+# from __future__ import absolute_import
 from __future__ import unicode_literals
-import logging
 
+from .dictionary import Dictionary
+from .word_extraction import extract
+
+
+__version__ = '1.1.0'
+
+__all__ = [str('Dictionary'), str('extract')]  # names, not objects; str() keeps them native strings under unicode_literals on Python 2
+
+
+# if len(logger.handlers) == 0:  # To ensure reload() doesn't add another one
+#     logger.addHandler(logging.NullHandler())
 
-from xinci.dictionary import Dictionary
-from xinci.word_extraction import WordExtract
 
 
-__version__ = '1.0'
 
-__all__ = []
 
 
-logger = logging.getLogger('xinci')