Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 65 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,51 +1,102 @@
# node-symspell

JavaScript port of SymSpell 6.6 based on the [original C# version by Wolf Garbe](https://github.com/wolfgarbe/SymSpell) and the [Python version by mammothb](https://github.com/mammothb/symspellpy).



Just like the Python version, this cuts out some of the C# memory optimisations, which aren't really relevant in JavaScript. As a result, this port is not optimised for speed, though it's still quite fast.



This version also includes the additions of the Python version such as the `ignoreToken` and `transferCasing` options. The unit tests provided are those of the Python version which are much more comprehensive than the original ones.



This library uses the `iter-tools` and `difflib` modules, which are JavaScript ports of the Python modules with similar names. Because it uses async/await and async generators, it needs at least Node 12.x.



**NOTE: this is still a work in progress and the API is likely to change**



## Basic Example



```js
const SymSpell = require('node-symspell')

const maxEditDistance = 2
const prefixLength = 7
const symSpell = new SymSpell(maxEditDistance, prefixLength)
await symSpell.loadDictionary(dictionaryPath, 0, 1)
await symSpell.loadBigramDictionary(bigramPath, 0, 2)
const SymSpell = require('node-symspell')
//import SymSpell, { Verbosity } from './index.js'
//node js compatible
//check `example.js` for more details

const maxEditDistance = 2

const typo = 'Can yu readthis messa ge despite thehorible sppelingmsitakes'
const results = symSpell.lookupCompound(typo, maxEditDistance)
const prefixLength = 7

const symSpell = new SymSpell(maxEditDistance, prefixLength)

await symSpell.loadDictionary(dictionaryPath, 0, 1)

await symSpell.loadBigramDictionary(bigramPath, 0, 2)



const typo = 'Can yu readthis messa ge despite thehorible sppelingmsitakes'

const results = symSpell.lookupCompound(typo, maxEditDistance)



console.log(results[0])

// {
// term: 'can you read this message despite the horrible spelling mistakes',
// distance: 10,
// count: 0
// }

// term: 'can you read this message despite the horrible spelling mistakes',

// distance: 10,

// count: 0

// }

```



## Main API overview



`constructor (maxDictionaryEditDistance = 2, prefixLength = 7, countThreshold = 1)`



`async loadDictionary (dictFile, termIndex, countIndex, separator = ' ')`



`async loadBigramDictionary (dictFile, termIndex, countIndex, separator = ' ')`



`lookup (input, verbosity, maxEditDistance = null, { includeUnknown, ignoreToken, transferCasing } = {})`



`lookupCompound (input, maxEditDistance = null, { ignoreNonWords, transferCasing } = {})`



`wordSegmentation (input, { maxEditDistance = null, maxSegmentationWordLength = null, ignoreToken } = {})`



## References

https://github.com/wolfgarbe/SymSpell
https://github.com/mammothb/symspellpy


https://github.com/wolfgarbe/SymSpell

https://github.com/mammothb/symspellpy
8 changes: 4 additions & 4 deletions edit-distance.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
const Helpers = require('./helpers')
import { nullDistanceResults, prefixSuffixPrep } from './helpers.js'

/// <summary>
/// Class providing optimized methods for computing Damerau-Levenshtein Optimal String
Expand Down Expand Up @@ -52,7 +52,7 @@ class EditDistance {
/// difference between the strings increases.</returns>
distance (string1 = null, string2 = null, maxDistance) {
if (string1 === null || string2 === null) {
return Helpers.nullDistanceResults(string1, string2, maxDistance)
return nullDistanceResults(string1, string2, maxDistance)
}

if (maxDistance <= 0) {
Expand All @@ -74,7 +74,7 @@ class EditDistance {
}

// identify common suffix and/or prefix that can be ignored
const { len1, len2, start } = Helpers.prefixSuffixPrep(string1, string2)
const { len1, len2, start } = prefixSuffixPrep(string1, string2)

if (len1 === 0) {
return (len2 <= iMaxDistance) ? len2 : -1
Expand Down Expand Up @@ -224,4 +224,4 @@ class EditDistance {
}
}

module.exports = EditDistance
export default EditDistance
19 changes: 19 additions & 0 deletions example.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Minimal end-to-end usage example: build a SymSpell instance, load the
// unigram and bigram frequency dictionaries, then correct a sentence that
// contains typos and wrongly split/merged words.
//
// The file uses `import` and top-level `await`, so it has to be run as an
// ES module.
// NOTE(review): `Verbosity` is imported but not used anywhere in this script.
import SymSpell, { Verbosity } from './index.js'

// Constructor arguments: maximum dictionary edit distance and prefix length.
const maxEditDistance = 2
const prefixLength = 7
const symSpell = new SymSpell(maxEditDistance, prefixLength)


// Paths are relative, so the script presumably expects to be started from the
// repository root — TODO confirm.
const dictionaryPath = './dictionaries/frequency_dictionary_en_82_765.txt' // for spelling correction (genuine English words)
const bigramPath = './dictionaries/frequency_bigramdictionary_en_243_342.txt'



// The two numeric arguments are the term index and the count index passed to
// the loader (0/1 for the unigram file, 0/2 for the bigram file).
// Both dictionaries are awaited before the lookup below runs.
await symSpell.loadDictionary(dictionaryPath, 0, 1)
await symSpell.loadBigramDictionary(bigramPath, 0, 2)

// Input mixes misspellings, missing spaces and a stray space inside a word.
const typo = 'Can yu readthis messa ge despite thehorible sppelingmsitakes'
const results = symSpell.lookupCompound(typo, maxEditDistance)

// Print only the first entry of the returned results.
console.log(results[0])
64 changes: 52 additions & 12 deletions helpers.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
const difflib = require('difflib')
const itertools = require('iter-tools')
import { SequenceMatcher } from 'difflib'
import { zip,zipAll } from 'iter-tools'

const { zip, zipLongest } = itertools
//const { zip, zipAll } = itertools

const isAcronym = (word) => {
export const isAcronym = (word) => {
// """Checks if the word is all caps (acronym) and/or contains numbers
// Parameters
// ----------
Expand All @@ -20,7 +20,7 @@ const isAcronym = (word) => {
return word.match(/\b[A-Z0-9]{2,}\b/)
}

const parseWordsCase = (phrase, preserveCase) => {
export const parseWordsCase = (phrase, preserveCase) => {
// """Create a non-unique wordlist from sample text. Language
// independent (e.g. works with Chinese characters)
// Parameters
Expand All @@ -45,7 +45,7 @@ const parseWordsCase = (phrase, preserveCase) => {
return Array.from(phrase.matchAll(/([^\W_]+['’]*[^\W_]*)/g), (m) => m[0])
}

const transferCasingMatching = (textWithCasing, textWithoutCasing) => {
export const transferCasingMatching = (textWithCasing, textWithoutCasing) => {
// """Transferring the casing from one text to another - assuming that
// they are 'matching' texts, alias they have the same length.
// Parameters
Expand All @@ -70,7 +70,7 @@ const transferCasingMatching = (textWithCasing, textWithoutCasing) => {
}).join('')
}

const transferCasingSimilar = (textWithCasing, textWithoutCasing) => {
export const transferCasingSimilar = (textWithCasing, textWithoutCasing) => {
// Transferring the casing from one text to another - for similar (not matching) text
// 1. It will use `difflib`'s `SequenceMatcher` to identify the
// different type of changes needed to turn `textWithCasing` into
Expand Down Expand Up @@ -107,7 +107,7 @@ const transferCasingSimilar = (textWithCasing, textWithoutCasing) => {
// If `textWithCasing` is empty
// """

const _sm = new difflib.SequenceMatcher(null, textWithCasing.toLowerCase(), textWithoutCasing)
const _sm = new SequenceMatcher(null, textWithCasing.toLowerCase(), textWithoutCasing)

// we will collect the case_text:
let c = ''
Expand Down Expand Up @@ -167,7 +167,7 @@ const transferCasingSimilar = (textWithCasing, textWithoutCasing) => {
// sequence
let _last = 'lower'

for (const [w, wo] of zipLongest(_withCasing, _withoutCasing)) {
for (const [w, wo] of zipAll(_withCasing, _withoutCasing)) {
if (w && wo) {
if (w === w.toUpperCase()) {
c += wo.toUpperCase()
Expand Down Expand Up @@ -198,7 +198,7 @@ const transferCasingSimilar = (textWithCasing, textWithoutCasing) => {

/// <summary>Determines the proper return value of an edit distance function when one or
/// both strings are null.</summary>
const nullDistanceResults = (string1, string2, maxDistance) => {
export const nullDistanceResults = (string1, string2, maxDistance) => {
if (string1 === null) {
return string2 === null ? 0 : (string2.length <= maxDistance) ? string2.length : -1
}
Expand All @@ -209,7 +209,7 @@ const nullDistanceResults = (string1, string2, maxDistance) => {
/// <summary>Calculates starting position and lengths of two strings such that common
/// prefix and suffix substrings are excluded.</summary>
/// <remarks>Expects string1.length to be less than or equal to string2.length</remarks>
const prefixSuffixPrep = (string1, string2) => {
export const prefixSuffixPrep = (string1, string2) => {
let len2 = string2.length
let len1 = string1.length // this is also the minimun length of the two strings

Expand All @@ -233,7 +233,47 @@ const prefixSuffixPrep = (string1, string2) => {
return { len1, len2, start }
}

module.exports = {

// permutations.js
/**
 * Builds every ordering of the items in `arr`.
 *
 * Orderings are produced by picking each position in turn as the head and
 * prepending it to every ordering of the remaining items, so the output
 * follows the index order of the input.
 *
 * @param {Array|string} arr - Items to permute (anything exposing `length`,
 *   index access, `slice` and `concat`).
 * @returns {Array} List of orderings. An empty input yields `[]`; a
 *   single-item input yields `[arr]`, i.e. the input itself (not a copy) as
 *   the only entry.
 */
export function permutations (arr) {
  const size = arr.length

  if (size === 0) return []
  // Base case of the recursion: the input itself is the only ordering.
  if (size === 1) return [arr]

  const collected = []
  let index = 0

  while (index < size) {
    const head = arr[index]
    // Everything except the item at `index`, in original order.
    const rest = arr.slice(0, index).concat(arr.slice(index + 1))

    permutations(rest).forEach((tail) => {
      collected.push([head, ...tail])
    })

    index += 1
  }

  return collected
}

// combinations.js
/**
 * Builds every combination of `length` items taken from `arr`, keeping the
 * items in their original relative order (index-lexicographic output order).
 *
 * @param {Array|string} arr - Source items (anything exposing `length` and
 *   index access). Items may themselves be arrays; they are kept intact.
 * @param {number} length - Number of items per combination.
 * @returns {Array<Array>} List of combinations. `length === 0` yields `[[]]`;
 *   a `length` that is negative, not an integer, or larger than `arr.length`
 *   yields `[]`.
 */
export function combinations (arr, length) {
  const results = []

  // Such a size can never be reached, so skip the search entirely instead of
  // walking every subset only to return nothing.
  if (!Number.isInteger(length) || length < 0 || length > arr.length) {
    return results
  }

  const combine = (start, combo) => {
    if (combo.length === length) {
      results.push(combo)
      return
    }

    // Last index from which enough items remain to complete the combination.
    const lastStart = arr.length - (length - combo.length)

    for (let i = start; i <= lastStart; i++) {
      // Spread rather than concat(): concat() flattens an item that is itself
      // an array, which would corrupt both the content and the size check.
      combine(i + 1, [...combo, arr[i]])
    }
  }

  combine(0, [])
  return results
}

export default {
isAcronym,
parseWordsCase,
transferCasingMatching,
Expand Down
Loading