diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 101176e..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.gitignore b/.gitignore index 60ae05e..be607a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,259 +1,5 @@ - -# User-specific files -*.suo -*.user -*.userosscache -*.sln.docstates - -# User-specific files (MonoDevelop/Xamarin Studio) -*.userprefs - -# Build results -[Dd]ebug/ -[Dd]ebugPublic/ -[Rr]elease/ -[Rr]eleases/ -x64/ -x86/ -bld/ -[Bb]in/ -[Oo]bj/ -[Ll]og/ - -# Visual Studio 2015 cache/options directory -.vs/ -# Uncomment if you have tasks that create the project's static files in wwwroot -#wwwroot/ - -# MSTest test Results -[Tt]est[Rr]esult*/ -[Bb]uild[Ll]og.* - -# NUNIT -*.VisualState.xml -TestResult.xml - -# Build Results of an ATL Project -[Dd]ebugPS/ -[Rr]eleasePS/ -dlldata.c - -# DNX -project.lock.json -project.fragment.lock.json -artifacts/ - -*_i.c -*_p.c -*_i.h -*.ilk -*.meta -*.obj -*.pch -*.pdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*.log -*.vspscc -*.vssscc -.builds -*.pidb -*.svclog -*.scc - -# Chutzpah Test files -_Chutzpah* - -# Visual C++ cache files -ipch/ -*.aps -*.ncb -*.opendb -*.opensdf -*.sdf -*.cachefile -*.VC.db -*.VC.VC.opendb - -# Visual Studio profiler -*.psess -*.vsp -*.vspx -*.sap - -# TFS 2012 Local Workspace -$tf/ - -# Guidance Automation Toolkit -*.gpState - -# ReSharper is a .NET coding add-in -_ReSharper*/ -*.[Rr]e[Ss]harper -*.DotSettings.user - -# JustCode is a .NET coding add-in -.JustCode - -# TeamCity is a build add-in -_TeamCity* - -# DotCover is a Code Coverage Tool -*.dotCover - -# NCrunch -_NCrunch_* -.*crunch*.local.xml -nCrunchTemp_* - -# MightyMoose -*.mm.* -AutoTest.Net/ - -# Web workbench (sass) -.sass-cache/ - -# Installshield output folder -[Ee]xpress/ - -# DocProject is a documentation generator add-in -DocProject/buildhelp/ -DocProject/Help/*.HxT -DocProject/Help/*.HxC -DocProject/Help/*.hhc -DocProject/Help/*.hhk -DocProject/Help/*.hhp -DocProject/Help/Html2 -DocProject/Help/html - -# Click-Once directory -publish/ - -# Publish Web Output -*.[Pp]ublish.xml -*.azurePubxml -# TODO: Comment the next line if you want to checkin your web deploy settings -# but database connection strings (with potential passwords) will be unencrypted -#*.pubxml -*.publishproj - -# Microsoft Azure Web App publish settings. Comment the next line if you want to -# checkin your Azure Web App publish settings, but sensitive information contained -# in these scripts will be unencrypted -PublishScripts/ - -# NuGet Packages -*.nupkg -# The packages folder can be ignored because of Package Restore -**/packages/* -# except build/, which is used as an MSBuild target. -!**/packages/build/ -# Uncomment if necessary however generally it will be regenerated when needed -#!**/packages/repositories.config -# NuGet v3's project.json files produces more ignoreable files -*.nuget.props -*.nuget.targets - -# Microsoft Azure Build Output -csx/ -*.build.csdef - -# Microsoft Azure Emulator -ecf/ -rcf/ - -# Windows Store app package directories and files -AppPackages/ -BundleArtifacts/ -Package.StoreAssociation.xml -_pkginfo.txt - -# Visual Studio cache files -# files ending in .cache can be ignored -*.[Cc]ache -# but keep track of directories ending in .cache -!*.[Cc]ache/ - -# Others -ClientBin/ -~$* -*~ -*.dbmdl -*.dbproj.schemaview -*.jfm -*.pfx -*.publishsettings -node_modules/ -orleans.codegen.cs - -# Since there are multiple workflows, uncomment next line to ignore bower_components -# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) -#bower_components/ - -# RIA/Silverlight projects -Generated_Code/ - -# Backup & report files from converting an old project file -# to a newer Visual Studio version. Backup files are not needed, -# because we have git ;-) -_UpgradeReport_Files/ -Backup*/ -UpgradeLog*.XML -UpgradeLog*.htm - -# SQL Server files -*.mdf -*.ldf - -# Business Intelligence projects -*.rdl.data -*.bim.layout -*.bim_*.settings - -# Microsoft Fakes -FakesAssemblies/ - -# GhostDoc plugin setting file -*.GhostDoc.xml - -# Node.js Tools for Visual Studio -.ntvs_analysis.dat - -# Visual Studio 6 build log -*.plg - -# Visual Studio 6 workspace options file -*.opt - -# Visual Studio LightSwitch build output -**/*.HTMLClient/GeneratedArtifacts -**/*.DesktopClient/GeneratedArtifacts -**/*.DesktopClient/ModelManifest.xml -**/*.Server/GeneratedArtifacts -**/*.Server/ModelManifest.xml -_Pvt_Extensions - -# Paket dependency manager -.paket/paket.exe -paket-files/ - -# FAKE - F# Make -.fake/ - -# JetBrains Rider -.idea/ -*.sln.iml - -# CodeRush -.cr/ - -# Python Tools for Visual Studio (PTVS) -__pycache__/ -*.pyc +/.vs +**/obj +**/bin +**/PublishProfiles +*.user \ No newline at end of file diff --git a/Analyser/.DS_Store b/Analyser/.DS_Store deleted file mode 100644 index 200e887..0000000 Binary files a/Analyser/.DS_Store and /dev/null differ diff --git a/Analyser/Analyser.csproj b/Analyser/Analyser.csproj index 36b1af4..8a672da 100644 --- a/Analyser/Analyser.csproj +++ b/Analyser/Analyser.csproj @@ -1,15 +1,18 @@  netstandard2.0 - Lucene.JIEba.Analyzer - 1.0.0 - SilentCC + Lucene.JIEba.Analyzer.NetCore + 1.0.1 + shshshddy JIEba.Lucene.Net is an analyzer tools for lucene.net which is kind to chinese false - https://github.com/SilentCC/JIEba-netcore2.0/ - Copyright 2019 (c) AgileLabs. All rights reserved. + https://github.com/shshshdy/JIEba-netcore + Copyright 2020 (c) AgileLabs. All rights reserved. Analyzer Segment JIEba.net core2.0 true + shshshddy + 1.0.1 + 删除多余依赖 @@ -24,10 +27,6 @@ - - - - diff --git a/README.md b/README.md index 08bf1b8..b78b0df 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,8 @@ ps: 修改了JIEba分词,导致的高亮bug TokenStream = analyzer.GetTokenStream(str,indexReader); ``` +>[具体demo参考](https://gitee.com/shshshdy/net-core-tool) - # 相关文档 -[JIEba.net 集成到Lucene.Net的过程](https://www.cnblogs.com/dacc123/p/8431369.html) +# 感谢原作 + +在[原作](https://github.com/SilentCC/JIEba-netcore2.0/)的基础上,修改了加载用户分词的BUG. diff --git a/Segmenter/.DS_Store b/Segmenter/.DS_Store deleted file mode 100644 index 25dc68f..0000000 Binary files a/Segmenter/.DS_Store and /dev/null differ diff --git a/Segmenter/Common/FileExtension.cs b/Segmenter/Common/FileExtension.cs index 1dd0157..136e414 100755 --- a/Segmenter/Common/FileExtension.cs +++ b/Segmenter/Common/FileExtension.cs @@ -26,10 +26,24 @@ public static string ReadEmbeddedAllLine(string path,Encoding encoding) public static List ReadEmbeddedAllLines(string path, Encoding encoding) { - var provider = new EmbeddedFileProvider(typeof(FileExtension).GetTypeInfo().Assembly); - var fileInfo = provider.GetFileInfo(path); - List list = new List(); - using (StreamReader streamReader = new StreamReader(fileInfo.CreateReadStream(), encoding)) + var assmbly = typeof(FileExtension).GetTypeInfo().Assembly; + return ReadEmbeddedAllLines(assmbly, path, encoding); + } + + public static List ReadEmbeddedAllLines(string path) + { + return ReadEmbeddedAllLines(path, Encoding.UTF8); + } + + public static List ReadAllLines(string path) + { + return ReadAllLines(path, Encoding.UTF8); + } + + public static List ReadAllLines(string path, Encoding encoding) + { + var list = new List(); + using (StreamReader streamReader = new StreamReader(path, encoding)) { string item; while ((item = streamReader.ReadLine()) != null) @@ -40,9 +54,25 @@ public static List ReadEmbeddedAllLines(string path, Encoding encoding) return list; } - public static List ReadEmbeddedAllLines(string path) + public static List ReadEmbeddedAllLines(Assembly assembly, string path) { - return ReadEmbeddedAllLines(path, Encoding.UTF8); + return ReadEmbeddedAllLines(assembly, path, Encoding.UTF8); + } + + public static List ReadEmbeddedAllLines(Assembly assembly, string path, Encoding encoding) + { + var provider = new EmbeddedFileProvider(assembly); + var fileInfo = provider.GetFileInfo(path); + List list = new List(); + using (StreamReader streamReader = new StreamReader(fileInfo.CreateReadStream(), encoding)) + { + string item; + while ((item = streamReader.ReadLine()) != null) + { + list.Add(item); + } + } + return list; } } } diff --git a/Segmenter/JiebaSegmenter.cs b/Segmenter/JiebaSegmenter.cs index 93cf056..33f07ae 100755 --- a/Segmenter/JiebaSegmenter.cs +++ b/Segmenter/JiebaSegmenter.cs @@ -7,6 +7,7 @@ using JiebaNet.Segmenter.Common; using JiebaNet.Segmenter.FinalSeg; using System.IO; +using System.Reflection; namespace JiebaNet.Segmenter { @@ -76,7 +77,7 @@ public IEnumerable Cut(string text, bool cutAll = false, bool hmm = true return CutIt(text, cutMethod, reHan, reSkip, cutAll); } - public IEnumerable Cut2(string text,bool cutAll=false,bool hmm=true) + public IEnumerable Cut2(string text, bool cutAll = false, bool hmm = true) { var reHan = RegexChineseDefault; var reSkip = RegexSkipDefault; @@ -185,7 +186,7 @@ public IEnumerable Tokenize(string text, TokenizerMode mode = TokenizerMo result.Add(new Token(w.value, w.position, w.position + width)); - } + } } return result; @@ -369,44 +370,45 @@ internal IEnumerable CutIt2(string text, Func(); var blocks = reHan.Split(text); var start = 0; - foreach(var blk in blocks) + foreach (var blk in blocks) { - if(string.IsNullOrWhiteSpace(blk)) + if (string.IsNullOrWhiteSpace(blk)) { start += blk.Length; continue; } - if(reHan.IsMatch(blk)) + if (reHan.IsMatch(blk)) { - foreach(var word in cutMethod(blk)) + foreach (var word in cutMethod(blk)) { - result.Add(new WordInfo(word,start)); + result.Add(new WordInfo(word, start)); start += word.Length; } } else { var tmp = reSkip.Split(blk); - foreach(var x in tmp) + foreach (var x in tmp) { - if(reSkip.IsMatch(x)) + if (reSkip.IsMatch(x)) { - result.Add(new WordInfo(x,start)); + result.Add(new WordInfo(x, start)); start += x.Length; } - else if(!cutAll) + else if (!cutAll) { - foreach(var ch in x) + foreach (var ch in x) { - result.Add(new WordInfo(ch.ToString(),start)); + result.Add(new WordInfo(ch.ToString(), start)); start += ch.ToString().Length; } } - else{ - - result.Add(new WordInfo(x,start)); + else + { + + result.Add(new WordInfo(x, start)); start += x.Length; - + } } } @@ -469,6 +471,54 @@ internal IEnumerable CutIt(string text, Func /// Loads user dictionaries. /// /// + public void LoadUserDictForEmbedded(Assembly assembly, string userDictFile) + { + Debug.WriteLine("Initializing user dictionary: " + userDictFile); + + lock (locker) + { + if (LoadedPath.Contains(userDictFile)) + return; + + try + { + var startTime = DateTime.Now.Millisecond; + + var lines = FileExtension.ReadEmbeddedAllLines(assembly, userDictFile); + foreach (var line in lines) + { + if (string.IsNullOrWhiteSpace(line)) + { + continue; + } + + var tokens = RegexUserDict.Match(line.Trim()).Groups; + var word = tokens["word"].Value.Trim(); + var freq = tokens["freq"].Value.Trim(); + var tag = tokens["tag"].Value.Trim(); + + var actualFreq = freq.Length > 0 ? int.Parse(freq) : 0; + AddWord(word, actualFreq, tag); + } + + Debug.WriteLine("user dict '{0}' load finished, time elapsed {1} ms", + userDictFile, DateTime.Now.Millisecond - startTime); + } + catch (IOException e) + { + Debug.Fail(string.Format("'{0}' load failure, reason: {1}", assembly.FullName.Split(',')[0] + "." + userDictFile, e.Message)); + } + catch (FormatException fe) + { + Debug.Fail(fe.Message); + } + } + } + + /// + /// Loads user dictionaries. + /// + /// public void LoadUserDict(string userDictFile) { var dictFullPath = Path.GetFullPath(userDictFile); @@ -483,7 +533,7 @@ public void LoadUserDict(string userDictFile) { var startTime = DateTime.Now.Millisecond; - var lines = FileExtension.ReadEmbeddedAllLines(dictFullPath); + var lines = FileExtension.ReadAllLines(dictFullPath); foreach (var line in lines) { if (string.IsNullOrWhiteSpace(line)) @@ -513,7 +563,6 @@ public void LoadUserDict(string userDictFile) } } } - public void AddWord(string word, int freq = 0, string tag = null) { if (freq <= 0) @@ -567,5 +616,5 @@ public enum TokenizerMode Search } - + } diff --git a/Segmenter/Segmenter.csproj b/Segmenter/Segmenter.csproj index 1e72eb3..dfa152d 100644 --- a/Segmenter/Segmenter.csproj +++ b/Segmenter/Segmenter.csproj @@ -2,15 +2,16 @@ netstandard2.0 - Lucene.JIEba.Segment + Lucene.JIEba.Segment.NetCore 1.0.0 - SilentCC + shshshdy JIEba.Lucene.Net is an analyzer tools for lucene.net which is kind to chinese false - https://github.com/SilentCC/JIEba-netcore2.0/ + https://github.com/shshshdy/JIEba-netcore Copyright 2019 (c) AgileLabs. All rights reserved. Analyzer Segment JIEba.net core2.0 true + shshshdy diff --git a/Test/SegmentTest.cs b/Test/SegmentTest.cs index c3d0e04..726ab24 100644 --- a/Test/SegmentTest.cs +++ b/Test/SegmentTest.cs @@ -6,6 +6,7 @@ using System.Linq; using jieba.NET; using Xunit; +using System.Reflection; namespace Test { @@ -15,8 +16,9 @@ public class SegmenterTest public void TestCut() { var segmenter = new JiebaSegmenter(); + segmenter.LoadUserDict(@"D:\lucene\dict.txt"); + segmenter.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(),"dict.txt"); var segments = segmenter.Cut("我来到北京清华大学", cutAll: true); - var resultWords = new List {"我", "来到", "北京", "清华", "清华大学", "华大", "大学"}; Compared(segments, resultWords); diff --git a/Test/Test.csproj b/Test/Test.csproj index 555765e..7407587 100644 --- a/Test/Test.csproj +++ b/Test/Test.csproj @@ -5,6 +5,14 @@ false + + + + + + + + diff --git a/Test/dict.txt b/Test/dict.txt new file mode 100644 index 0000000..9b8806d --- /dev/null +++ b/Test/dict.txt @@ -0,0 +1,2 @@ +习近平 +习大大 \ No newline at end of file diff --git a/jieba.NET/.DS_Store b/jieba.NET/.DS_Store deleted file mode 100644 index 89de716..0000000 Binary files a/jieba.NET/.DS_Store and /dev/null differ diff --git a/jieba.NET/JieBaAnalyzer.cs b/jieba.NET/JieBaAnalyzer.cs index 50a7f04..d2e32c8 100644 --- a/jieba.NET/JieBaAnalyzer.cs +++ b/jieba.NET/JieBaAnalyzer.cs @@ -9,19 +9,19 @@ namespace jieba.NET { - public class JieBaAnalyzer - :Analyzer + public class JieBaAnalyzer : Analyzer { - public TokenizerMode mode; - public JieBaAnalyzer(TokenizerMode Mode) - :base() + TokenizerMode _mode; + bool _defaultUserDict; + public JieBaAnalyzer(TokenizerMode Mode, bool defaultUserDict = false) : base() { - this.mode = Mode; + _mode = Mode; + _defaultUserDict = defaultUserDict; } - protected override TokenStreamComponents CreateComponents(string filedName,TextReader reader) + protected override TokenStreamComponents CreateComponents(string filedName, TextReader reader) { - var tokenizer = new JieBaTokenizer(reader,mode); + var tokenizer = new JieBaTokenizer(reader, _mode, _defaultUserDict); var tokenstream = (TokenStream)new LowerCaseFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, tokenizer); diff --git a/jieba.NET/JieBaTokenizer.cs b/jieba.NET/JieBaTokenizer.cs index 9530001..882a704 100644 --- a/jieba.NET/JieBaTokenizer.cs +++ b/jieba.NET/JieBaTokenizer.cs @@ -5,71 +5,74 @@ using System.IO; using System.Collections.Generic; using System.Reflection; -using Microsoft.Extensions.FileProviders; - +using JiebaNet.Segmenter.Common; +using System.Text.RegularExpressions; +using Token = JiebaNet.Segmenter.Token; namespace jieba.NET { - public class JieBaTokenizer - : Tokenizer + public class JieBaTokenizer : Tokenizer { - private static bool _initial = false; private string _inputText; - private bool _originalResult = false; - private int _start = 0; - - private readonly string _stropWordsPath = "Resources/stopwords.txt"; + private readonly string _dictPath = "Resources/dict.txt"; private readonly JiebaSegmenter _segmenter; private TokenizerMode _mode; private ICharTermAttribute _termAtt; private IOffsetAttribute _offsetAtt; - private IPositionIncrementAttribute _posIncrAtt; + //private IPositionIncrementAttribute _posIncrAtt; private ITypeAttribute _typeAtt; + private readonly List _wordList = new List(); - private Dictionary _stopWords = new Dictionary(); - private List _wordList = new List(); + private IEnumerator _iter; - private IEnumerator _iter; + public List StopWords { get; } = new List(); - public JieBaTokenizer(TextReader input, TokenizerMode Mode) + /// + /// + /// + /// + /// + /// 致敬习大大用 + public JieBaTokenizer(TextReader input, TokenizerMode Mode, bool defaultUserDict = false) : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input) { + _segmenter = new JiebaSegmenter(); _mode = Mode; - LoadStopWords(); - Init(); - } - - public Dictionary StopWords - { - get => _stopWords; - } - - private void LoadStopWords() - { - var fileProvider = new EmbeddedFileProvider(GetType().GetTypeInfo().Assembly); - var fileInfo = fileProvider.GetFileInfo(_stropWordsPath); + if (defaultUserDict) + { + _segmenter.LoadUserDictForEmbedded(Assembly.GetCallingAssembly(), _dictPath); + } - using (var reader = new StreamReader(fileInfo.CreateReadStream())) + if (!string.IsNullOrEmpty(Settings.IgnoreDictFile)) { - var s = ""; - while ((s = reader.ReadLine()) != null) + var list = FileExtension.ReadAllLines(Settings.IgnoreDictFile); + foreach (var item in list) { - if (String.IsNullOrEmpty(s)) + if (string.IsNullOrEmpty(item)) continue; - if (_stopWords.ContainsKey(s)) + if (StopWords.Contains(item)) continue; - _stopWords.Add(s, 1); + StopWords.Add(item); } } + + if (!string.IsNullOrEmpty(Settings.UserDictFile)) + { + _segmenter.LoadUserDict(Settings.UserDictFile); + } + + Init(); } + #region private func private void Init() { _termAtt = AddAttribute(); _offsetAtt = AddAttribute(); - _posIncrAtt = AddAttribute(); + //_posIncrAtt = AddAttribute(); _typeAtt = AddAttribute(); + AddAttribute(); } private string ReadToEnd(TextReader input) @@ -77,7 +80,30 @@ private string ReadToEnd(TextReader input) return input.ReadToEnd(); } - public sealed override Boolean IncrementToken() + + private Lucene.Net.Analysis.Token Next() + { + var res = _iter.MoveNext(); + if (res) + { + var word = _iter.Current; + var token = new Lucene.Net.Analysis.Token(word.Word, word.StartIndex, word.EndIndex); + if (Settings.Log) + { + //chinese char + var zh = new Regex(@"[\u4e00-\u9fa5]|[^\x00-\xff]"); + var offset = zh.Matches(word.Word).Count; + var len = 10; + offset = offset > len ? 0 : offset; + Console.WriteLine($"==分词:{ word.Word.PadRight(len - offset, '=') }==起始位置:{ word.StartIndex.ToString().PadLeft(3, '=') }==结束位置{ word.EndIndex.ToString().PadLeft(3, '=') }"); + } + return token; + } + return null; + } + #endregion + + public sealed override bool IncrementToken() { ClearAttributes(); @@ -96,19 +122,6 @@ public sealed override Boolean IncrementToken() return false; } - private Lucene.Net.Analysis.Token Next() - { - var length = 0; - var res = _iter.MoveNext(); - if (res) - { - var word = _iter.Current; - var token = new Lucene.Net.Analysis.Token(word.Word, word.StartIndex, word.EndIndex); - _start += length; - return token; - } - return null; - } public override void Reset() { @@ -117,17 +130,16 @@ public override void Reset() _inputText = ReadToEnd(base.m_input); RemoveStopWords(_segmenter.Tokenize(_inputText, _mode)); - _start = 0; _iter = _wordList.GetEnumerator(); } - private void RemoveStopWords(IEnumerable words) + private void RemoveStopWords(IEnumerable words) { _wordList.Clear(); foreach (var x in words) { - if (!_stopWords.ContainsKey(x.Word)) + if (!StopWords.Contains(x.Word)) { _wordList.Add(x); } diff --git a/jieba.NET/Resources/dict.txt b/jieba.NET/Resources/dict.txt new file mode 100644 index 0000000..9e8a006 --- /dev/null +++ b/jieba.NET/Resources/dict.txt @@ -0,0 +1,7 @@ +毛泽东 +周恩来 +邓小平 +江泽民 +胡锦涛 +习近平 +习大大 \ No newline at end of file diff --git a/jieba.NET/Resources/stopwords.txt b/jieba.NET/Resources/stopwords.txt deleted file mode 100755 index 844b947..0000000 --- a/jieba.NET/Resources/stopwords.txt +++ /dev/null @@ -1,654 +0,0 @@ -i -me -my -myself -we -our -ours -ourselves -you -your -yours -yourself -yourselves -he -him -his -himself -she -her -hers -herself -it -its -itself -they -them -their -theirs -themselves -what -which -who -whom -this -that -these -those -am -is -are -was -were -be -been -being -have -has -had -having -do -does -did -doing -a -an -the -and -but -if -or -because -as -until -while -of -at -by -for -with -about -against -between -into -through -during -before -after -above -below -to -from -up -down -in -out -on -off -over -under -again -further -then -once -here -there -when -where -why -how -all -any -both -each -few -more -most -other -some -such -no -nor -not -only -own -same -so -than -too -very -s -t -can -will -just -don -should -now -一番 -一直 -一个 -一些 -许多 -种 -有的是 -也就是说 -阿 -哎呀 -哎哟 -俺 -俺们 -按 -按照 -吧 -吧哒 -把 -罢了 -被 -本 -本着 -比 -比方 -比如 -鄙人 -彼 -彼此 -边 -别 -别的 -别说 -并 -并且 -不比 -不成 -不单 -不但 -不独 -不管 -不光 -不过 -不仅 -不拘 -不论 -不怕 -不然 -不如 -不特 -不惟 -不问 -不只 -朝 -朝着 -趁 -趁着 -乘 -冲 -除 -除此之外 -除非 -除了 -此 -此间 -此外 -从 -从而 -打 -待 -但 -但是 -当 -当着 -到 -得 -的 -的话 -等 -等等 -地 -第 -叮咚 -对 -对于 -多 -多少 -而 -而况 -而且 -而是 -而外 -而言 -而已 -尔后 -反过来 -反过来说 -反之 -非但 -非徒 -否则 -嘎 -嘎登 -该 -赶 -个 -各 -各个 -各位 -各种 -各自 -给 -根据 -跟 -故 -故此 -固然 -关于 -管 -归 -果然 -果真 -过 -和 -何 -何处 -何况 -何时 -嘿 -哼 -哼唷 -呼哧 -乎 -哗 -还是 -还有 -换句话说 -换言之 -或 -或是 -或者 -极了 -及 -及其 -及至 -即 -即便 -即或 -即令 -即若 -即使 -几 -几时 -己 -既 -既然 -既是 -继而 -加之 -假如 -假若 -假使 -鉴于 -将 -较 -较之 -叫 -接着 -结果 -借 -紧接着 -进而 -尽 -尽管 -经 -经过 -就 -就是 -就是说 -据 -具体地说 -具体说来 -开始 -开外 -靠 -咳 -可 -可见 -可是 -可以 -况且 -啦 -来 -来着 -离 -例如 -哩 -连 -连同 -两者 -了 -临 -另 -另外 -另一方面 -论 -嘛 -吗 -慢说 -漫说 -冒 -么 -每 -每当 -们 -莫若 -某 -某个 -某些 -拿 -哪 -哪边 -哪儿 -哪个 -哪里 -哪年 -哪怕 -哪天 -哪些 -哪样 -那 -那边 -那儿 -那个 -那会儿 -那里 -那么 -那么些 -那么样 -那时 -那些 -那样 -乃 -乃至 -呢 -能 -你 -你们 -您 -宁 -宁可 -宁肯 -宁愿 -哦 -啪达 -旁人 -凭 -凭借 -其 -其次 -其二 -其他 -其它 -其一 -其余 -其中 -起 -起见 -起见 -岂但 -恰恰相反 -前后 -前者 -且 -然而 -然后 -然则 -让 -人家 -任 -任何 -任凭 -如 -如此 -如果 -如何 -如其 -如若 -如上所述 -若 -若非 -若是 -啥 -上下 -尚且 -设若 -设使 -甚而 -甚么 -甚至 -省得 -时候 -什么 -什么样 -使得 -是 -是的 -首先 -谁 -顺 -顺着 -似的 -虽 -虽然 -虽说 -虽则 -随 -随着 -所 -所以 -他 -他们 -他人 -它 -它们 -她 -她们 -倘 -倘或 -倘然 -倘若 -倘使 -腾 -替 -通过 -同 -同时 -哇 -万一 -往 -望 -为 -为何 -为了 -为什么 -为着 -喂 -嗡嗡 -我 -我们 -呜 -呜呼 -乌乎 -无论 -无宁 -毋宁 -嘻 -吓 -相对而言 -像 -向 -向着 -嘘 -焉 -沿 -沿着 -要 -要不 -要不然 -要不是 -要么 -要是 -也 -也罢 -也好 -一 -一旦 -一方面 -一来 -一切 -一样 -一则 -依 -依照 -矣 -以 -以便 -以及 -以免 -以至 -以至于 -以致 -抑或 -因 -因此 -因而 -因为 -用 -由 -由此可见 -由于 -有 -有的 -有关 -有些 -又 -于 -于是 -于是乎 -与 -与此同时 -与否 -与其 -越是 -云云 -哉 -再说 -再者 -在 -在下 -咱 -咱们 -则 -怎 -怎么办 -怎么样 -咋 -照 -照着 -者 -这 -这边 -这儿 -这个 -这会儿 -这就是说 -这里 -这么 -这么点儿 -这么些 -这么样 -这时 -这些 -这样 -正如 -吱 -之 -之类 -之所以 -之一 -只是 -只限 -只要 -只有 -至 -至于 -诸位 -着 -着呢 -自 -自从 -自个儿 -自各儿 -自己 -自家 -自身 -综上所述 -总的来看 -总的来说 -总的说来 -总而言之 -总之 -纵 -纵令 -纵然 -纵使 -遵照 -作为 -兮 -呗 -咚 -咦 -喏 -啐 -喔唷 -嗬 -嗯 -嗳 -。 -, -: -; -、 -“ -” -【 -】 -《 -》 -( -) -— -… -. -, -: -; -" -" -[ -] -< -> -( -) -@ -# -* -& -% -¥ -$ -- -+ -= -| -\ - \ No newline at end of file diff --git a/jieba.NET/Settings.cs b/jieba.NET/Settings.cs new file mode 100644 index 0000000..5644aea --- /dev/null +++ b/jieba.NET/Settings.cs @@ -0,0 +1,26 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace jieba.NET +{ + /// + /// JieBaAnalyzer 实例化之前使用 + /// + public static class Settings + { + /// + /// show log + /// + public static bool Log { get; set; } = false; + + /// + /// 忽略词典,每行一词 + /// + public static string IgnoreDictFile { get; set; } + /// + ///自定义词典,每行一词 + /// + public static string UserDictFile { get; set; } + } +} diff --git a/jieba.NET/jieba.NET.csproj b/jieba.NET/jieba.NET.csproj index 9712f18..c7f62f3 100644 --- a/jieba.NET/jieba.NET.csproj +++ b/jieba.NET/jieba.NET.csproj @@ -2,30 +2,32 @@ netstandard2.0 - Lucene.JIEba.net - 1.1.1 - SilentCC + Lucene.JIEba.NetCore + 1.1.4 + shshshdy JIEba.Lucene.Net is an analyzer tools for lucene.net which is kind to chinese false - https://github.com/SilentCC/JIEba-netcore2.0/ - Copyright 2019 (c) AgileLabs. All rights reserved. + https://github.com/shshshdy/JIEba-netcore + Copyright 2020 (c) AgileLabs. All rights reserved. Analyzer Segment JIEba.net core2.0 true + shshshdy + 1.1.4 + 删除多余依赖 - - - - - + - + + + +