Skip to content

Commit 15d50fc

Browse files
committed
feat(core): Add a new string index: shingles
1 parent 1d81562 commit 15d50fc

File tree

7 files changed

+78
-7
lines changed

7 files changed

+78
-7
lines changed

dgraph/cmd/alpha/dashboard.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ func keywordHandler(w http.ResponseWriter, r *http.Request) {
5252
"@normalize",
5353
"after",
5454
"allofterms",
55+
"shingles",
5556
"alloftext",
5657
"and",
5758
"anyofterms",

dql/parser.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1700,7 +1700,7 @@ func validFuncName(name string) bool {
17001700
}
17011701

17021702
switch name {
1703-
case "regexp", "anyofterms", "allofterms", "alloftext", "anyoftext",
1703+
case "regexp", "anyofterms", "allofterms", "alloftext", "anyoftext", "shingles",
17041704
"has", "uid", "uid_in", "anyof", "allof", "type", "match", "similar_to":
17051705
return true
17061706
}

query/query.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1007,7 +1007,7 @@ func calculatePaginationParams(sg *SubGraph) (int32, int32) {
10071007
shouldExclude := false
10081008
if sg.SrcFunc != nil {
10091009
switch sg.SrcFunc.Name {
1010-
case "regexp", "alloftext", "allofterms", "match":
1010+
case "regexp", "alloftext", "allofterms", "match", "shingles":
10111011
shouldExclude = true
10121012
default:
10131013
shouldExclude = false
@@ -2746,7 +2746,7 @@ func isValidArg(a string) bool {
27462746
// isValidFuncName checks if fn passed is valid keyword.
27472747
func isValidFuncName(f string) bool {
27482748
switch f {
2749-
case "anyofterms", "allofterms", "val", "regexp", "anyoftext", "alloftext",
2749+
case "anyofterms", "allofterms", "val", "regexp", "anyoftext", "alloftext", "shingles",
27502750
"has", "uid", "uid_in", "anyof", "allof", "type", "match", "similar_to":
27512751
return true
27522752
}

tok/tok.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ const (
4848
IdentSha = 0xC
4949
IdentBigFloat = 0xD
5050
IdentVFloat = 0xE
51+
IdentShingles = 0xF
5152
IdentCustom = 0x80
5253
IdentDelimiter = 0x1f // ASCII 31 - Unit separator
5354
)
@@ -98,6 +99,7 @@ func init() {
9899
registerTokenizer(HashTokenizer{})
99100
registerTokenizer(TermTokenizer{})
100101
registerTokenizer(FullTextTokenizer{})
102+
registerTokenizer(ShinglesTokenizer{})
101103
registerTokenizer(Sha256Tokenizer{})
102104
setupBleve()
103105
}
@@ -424,6 +426,57 @@ func (t ExactTokenizer) Prefix() []byte {
424426
return prefix
425427
}
426428

429+
type ShinglesTokenizer struct {
430+
lang string
431+
}
432+
433+
func (t ShinglesTokenizer) Name() string { return "shingles" }
434+
func (t ShinglesTokenizer) Type() string { return "string" }
435+
func (t ShinglesTokenizer) Tokens(v interface{}) ([]string, error) {
436+
str, ok := v.(string)
437+
if !ok || str == "" {
438+
return []string{}, nil
439+
}
440+
lang := LangBase(t.lang)
441+
442+
// Step 1: Lowercase, normalize, basic tokenization
443+
tokens := fulltextAnalyzer.Analyze([]byte(str))
444+
445+
// Step 2: Remove stopwords
446+
tokens = filterStopwords(lang, tokens)
447+
448+
// Step 3: Apply stemming
449+
tokens = filterStemmers(lang, tokens)
450+
451+
// Step 4: Generate shingles (bigrams and trigrams)
452+
shingled := make([]string, 0, len(tokens))
453+
n := len(tokens)
454+
455+
for i := 0; i < n; i++ {
456+
// unigram
457+
shingled = append(shingled, string(tokens[i].Term))
458+
459+
// bigram
460+
if i+1 < n {
461+
shingled = append(shingled, string(tokens[i].Term)+" "+string(tokens[i+1].Term))
462+
}
463+
// trigram
464+
if i+2 < n {
465+
shingled = append(shingled, string(tokens[i].Term)+" "+string(tokens[i+1].Term)+" "+string(tokens[i+2].Term))
466+
}
467+
468+
if i+3 < n {
469+
shingled = append(shingled, string(tokens[i].Term)+" "+string(tokens[i+1].Term)+" "+string(tokens[i+2].Term)+" "+string(tokens[i+3].Term))
470+
}
471+
}
472+
473+
// Step 5: Deduplicate
474+
return x.RemoveDuplicates(shingled), nil
475+
}
476+
func (t ShinglesTokenizer) Identifier() byte { return IdentShingles }
477+
func (t ShinglesTokenizer) IsSortable() bool { return false }
478+
func (t ShinglesTokenizer) IsLossy() bool { return true }
479+
427480
// FullTextTokenizer generates full-text tokens from string data.
428481
type FullTextTokenizer struct{ lang string }
429482

tok/tokens.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ func GetTermTokens(funcArgs []string) ([]string, error) {
5959
return GetTokens(IdentTerm, funcArgs...)
6060
}
6161

62+
// GetShinglesTokens returns the shingle tokens for the given function
// argument in the given language.
//
// NOTE(review): unlike GetFullTextTokens below, this indexes funcArgs[0]
// without first checking len(funcArgs) == 1, so an empty slice would panic.
// Presumably callers have already validated the argument count (e.g. via
// ensureArgsCount in the query layer) — confirm, or add the same guard here.
func GetShinglesTokens(funcArgs []string, lang string) ([]string, error) {
	return BuildTokens(funcArgs[0], ShinglesTokenizer{lang: lang})
}
65+
6266
// GetFullTextTokens returns the full-text tokens for the given value.
6367
func GetFullTextTokens(funcArgs []string, lang string) ([]string, error) {
6468
if l := len(funcArgs); l != 1 {

worker/task.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ const (
210210
geoFn
211211
passwordFn
212212
regexFn
213+
shinglesFn
213214
fullTextSearchFn
214215
hasFn
215216
uidInFn
@@ -248,6 +249,8 @@ func parseFuncTypeHelper(name string) (FuncType, string) {
248249
return passwordFn, f
249250
case "regexp":
250251
return regexFn, f
252+
case "shingles":
253+
return shinglesFn, f
251254
case "alloftext", "anyoftext":
252255
return fullTextSearchFn, f
253256
case "has":
@@ -311,7 +314,7 @@ func (srcFn *functionContext) needsValuePostings(typ types.TypeID) (bool, error)
311314
return false, nil
312315
}
313316
return true, nil
314-
case geoFn, regexFn, fullTextSearchFn, standardFn, hasFn, customIndexFn, matchFn:
317+
case geoFn, regexFn, fullTextSearchFn, standardFn, hasFn, customIndexFn, matchFn, shinglesFn:
315318
// All of these require an index, hence would require fetching uid postings.
316319
return false, nil
317320
case uidInFn, compareScalarFn:
@@ -819,7 +822,7 @@ func (qs *queryState) handleUidPostings(
819822
} else {
820823
key = x.DataKey(q.Attr, q.UidList.Uids[i])
821824
}
822-
case geoFn, regexFn, fullTextSearchFn, standardFn, customIndexFn, matchFn,
825+
case geoFn, regexFn, fullTextSearchFn, standardFn, customIndexFn, matchFn, shinglesFn,
823826
compareAttrFn:
824827
key = x.IndexKey(q.Attr, srcFn.tokens[i])
825828
default:
@@ -1189,7 +1192,7 @@ func needsStringFiltering(srcFn *functionContext, langs []string, attr string) b
11891192
return langForFunc(langs) != "." &&
11901193
(srcFn.fnType == standardFn || srcFn.fnType == hasFn ||
11911194
srcFn.fnType == fullTextSearchFn || srcFn.fnType == compareAttrFn ||
1192-
srcFn.fnType == customIndexFn)
1195+
srcFn.fnType == customIndexFn || srcFn.fnType == shinglesFn)
11931196
}
11941197

11951198
func (qs *queryState) handleCompareScalarFunction(ctx context.Context, arg funcArgs) error {
@@ -1704,6 +1707,11 @@ func (qs *queryState) filterStringFunction(arg funcArgs) error {
17041707
case hasFn:
17051708
// Dont do anything, as filtering based on lang is already
17061709
// done above.
1710+
case shinglesFn:
1711+
filter.tokens = arg.srcFn.tokens
1712+
filter.match = defaultMatch
1713+
filter.tokName = "shingles"
1714+
filtered = matchStrings(filtered, values, &filter)
17071715
case fullTextSearchFn:
17081716
filter.tokens = arg.srcFn.tokens
17091717
filter.match = defaultMatch
@@ -2024,7 +2032,7 @@ func parseSrcFn(ctx context.Context, q *pb.Query) (*functionContext, error) {
20242032
return nil, err
20252033
}
20262034
fc.n = len(q.UidList.Uids)
2027-
case standardFn, fullTextSearchFn:
2035+
case standardFn, fullTextSearchFn, shinglesFn:
20282036
// srcfunc 0th val is func name and [2:] are args.
20292037
// we tokenize the arguments of the query.
20302038
if err = ensureArgsCount(q.SrcFunc, 1); err != nil {

worker/tokens.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import (
2121
func verifyStringIndex(ctx context.Context, attr string, funcType FuncType) (string, bool) {
2222
var requiredTokenizer tok.Tokenizer
2323
switch funcType {
24+
case shinglesFn:
25+
requiredTokenizer = tok.ShinglesTokenizer{}
2426
case fullTextSearchFn:
2527
requiredTokenizer = tok.FullTextTokenizer{}
2628
case matchFn:
@@ -63,6 +65,9 @@ func getStringTokens(funcArgs []string, lang string, funcType FuncType) ([]strin
6365
if funcType == fullTextSearchFn {
6466
return tok.GetFullTextTokens(funcArgs, lang)
6567
}
68+
if funcType == shinglesFn {
69+
return tok.GetShinglesTokens(funcArgs, lang)
70+
}
6671
return tok.GetTermTokens(funcArgs)
6772
}
6873

0 commit comments

Comments
 (0)