Skip to content

Commit e720fb8

Browse files
authored
Move SCIP utilities from sourcegraph/sourcegraph (#138)
1 parent f3c33b4 commit e720fb8

File tree

5 files changed

+659
-0
lines changed

5 files changed

+659
-0
lines changed

bindings/go/scip/canonicalize.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
package scip
2+
3+
// CanonicalizeDocument deterministically re-orders the fields of the given document.
4+
func CanonicalizeDocument(document *Document) *Document {
5+
document.Occurrences = CanonicalizeOccurrences(document.Occurrences)
6+
document.Symbols = CanonicalizeSymbols(document.Symbols)
7+
return SanitizeDocument(document)
8+
}
9+
10+
// CanonicalizeOccurrences deterministically re-orders the fields of the given occurrence slice.
11+
func CanonicalizeOccurrences(occurrences []*Occurrence) []*Occurrence {
12+
canonicalized := make([]*Occurrence, 0, len(occurrences))
13+
for _, occurrence := range FlattenOccurrences(occurrences) {
14+
canonicalized = append(canonicalized, CanonicalizeOccurrence(occurrence))
15+
}
16+
17+
return SortOccurrences(canonicalized)
18+
}
19+
20+
// CanonicalizeOccurrence deterministically re-orders the fields of the given occurrence.
21+
func CanonicalizeOccurrence(occurrence *Occurrence) *Occurrence {
22+
// Express ranges as three-components if possible
23+
occurrence.Range = NewRange(occurrence.Range).SCIPRange()
24+
occurrence.Diagnostics = CanonicalizeDiagnostics(occurrence.Diagnostics)
25+
return occurrence
26+
}
27+
28+
// CanonicalizeDiagnostics deterministically re-orders the fields of the given diagnostic slice.
29+
func CanonicalizeDiagnostics(diagnostics []*Diagnostic) []*Diagnostic {
30+
canonicalized := make([]*Diagnostic, 0, len(diagnostics))
31+
for _, diagnostic := range diagnostics {
32+
canonicalized = append(canonicalized, CanonicalizeDiagnostic(diagnostic))
33+
}
34+
35+
return SortDiagnostics(canonicalized)
36+
}
37+
38+
// CanonicalizeDiagnostic deterministically re-orders the fields of the given diagnostic.
39+
func CanonicalizeDiagnostic(diagnostic *Diagnostic) *Diagnostic {
40+
diagnostic.Tags = SortDiagnosticTags(diagnostic.Tags)
41+
return diagnostic
42+
}
43+
44+
// CanonicalizeSymbols deterministically re-orders the fields of the given symbols slice.
45+
func CanonicalizeSymbols(symbols []*SymbolInformation) []*SymbolInformation {
46+
canonicalized := make([]*SymbolInformation, 0, len(symbols))
47+
for _, symbol := range FlattenSymbols(symbols) {
48+
canonicalized = append(canonicalized, CanonicalizeSymbol(symbol))
49+
}
50+
51+
return SortSymbols(canonicalized)
52+
}
53+
54+
// CanonicalizeSymbol deterministically re-orders the fields of the given symbol.
55+
func CanonicalizeSymbol(symbol *SymbolInformation) *SymbolInformation {
56+
symbol.Relationships = CanonicalizeRelationships(symbol.Relationships)
57+
return symbol
58+
}
59+
60+
// CanonicalizeRelationships deterministically re-orders the fields of the given relationship slice.
61+
func CanonicalizeRelationships(relationships []*Relationship) []*Relationship {
62+
return SortRelationships(FlattenRelationship(relationships))
63+
}

bindings/go/scip/flatten.go

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
package scip
2+
3+
// FlattenDocuments merges elements of the given slice with the same relative path. This allows us to make
4+
// the assumption post-canonicalization that each index has one representation of a given document path in
5+
// the database. This function returns a new slice.
6+
func FlattenDocuments(documents []*Document) []*Document {
7+
documentMap := make(map[string]*Document, len(documents))
8+
for _, document := range documents {
9+
existing, ok := documentMap[document.RelativePath]
10+
if !ok {
11+
documentMap[document.RelativePath] = document
12+
continue
13+
}
14+
if existing.Language != document.Language {
15+
_ = 0 // TODO - warn?
16+
}
17+
18+
existing.Symbols = append(existing.Symbols, document.Symbols...)
19+
existing.Occurrences = append(existing.Occurrences, document.Occurrences...)
20+
}
21+
22+
flattened := make([]*Document, 0, len(documentMap))
23+
for _, document := range documentMap {
24+
flattened = append(flattened, document)
25+
}
26+
27+
return flattened
28+
}
29+
30+
// FlattenSymbol merges elements of the given slice with the same symbol name. This allows us to make the
31+
// assumption post-canonicalization that each index and document refer to one symbol metadata object uniquely.
32+
// This function returns a new slice.
33+
func FlattenSymbols(symbols []*SymbolInformation) []*SymbolInformation {
34+
symbolMap := make(map[string]*SymbolInformation, len(symbols))
35+
for _, symbol := range symbols {
36+
existing, ok := symbolMap[symbol.Symbol]
37+
if !ok {
38+
symbolMap[symbol.Symbol] = symbol
39+
continue
40+
}
41+
42+
existing.Documentation = combineDocumentation(existing.Documentation, symbol.Documentation)
43+
existing.Relationships = append(existing.Relationships, symbol.Relationships...)
44+
}
45+
46+
flattened := make([]*SymbolInformation, 0, len(symbolMap))
47+
for _, symbol := range symbolMap {
48+
flattened = append(flattened, symbol)
49+
}
50+
51+
return flattened
52+
}
53+
54+
// FlattenOccurrences merges elements of the given slice with equivalent bounds. This function returns a new slice.
55+
func FlattenOccurrences(occurrences []*Occurrence) []*Occurrence {
56+
if len(occurrences) == 0 {
57+
return occurrences
58+
}
59+
60+
_ = SortOccurrences(occurrences)
61+
flattened := make([]*Occurrence, 0, len(occurrences))
62+
flattened = append(flattened, occurrences[0])
63+
64+
for _, occurrence := range occurrences[1:] {
65+
top := flattened[len(flattened)-1]
66+
67+
if !rawRangesEqual(top.Range, occurrence.Range) {
68+
flattened = append(flattened, occurrence)
69+
continue
70+
}
71+
if top.Symbol != occurrence.Symbol {
72+
flattened = append(flattened, occurrence)
73+
continue
74+
}
75+
76+
if top.SyntaxKind == SyntaxKind_UnspecifiedSyntaxKind {
77+
// Take first valid syntax kind
78+
top.SyntaxKind = occurrence.SyntaxKind
79+
}
80+
81+
// Combine all other fields
82+
top.SymbolRoles |= occurrence.SymbolRoles
83+
top.OverrideDocumentation = append(top.OverrideDocumentation, occurrence.OverrideDocumentation...)
84+
top.Diagnostics = append(top.Diagnostics, occurrence.Diagnostics...)
85+
}
86+
87+
return flattened
88+
}
89+
90+
// FlattenRelationship merges elements of the given slice with equivalent symbol names. This function returns a new
91+
// slice.
92+
func FlattenRelationship(relationships []*Relationship) []*Relationship {
93+
relationshipMap := make(map[string][]*Relationship, len(relationships))
94+
for _, relationship := range relationships {
95+
relationshipMap[relationship.Symbol] = append(relationshipMap[relationship.Symbol], relationship)
96+
}
97+
98+
flattened := make([]*Relationship, 0, len(relationshipMap))
99+
for _, relationships := range relationshipMap {
100+
combined := relationships[0]
101+
for _, relationship := range relationships[1:] {
102+
combined.IsReference = combined.IsReference || relationship.IsReference
103+
combined.IsImplementation = combined.IsImplementation || relationship.IsImplementation
104+
combined.IsTypeDefinition = combined.IsTypeDefinition || relationship.IsTypeDefinition
105+
combined.IsDefinition = combined.IsDefinition || relationship.IsDefinition
106+
}
107+
108+
flattened = append(flattened, combined)
109+
}
110+
111+
return flattened
112+
}
113+
114+
// combineDocumentation merges documentation components from two separate symbol information objects.
115+
// combineDocumentation merges documentation components from two separate symbol information objects.
// Strings from additional that already appear in existing are dropped; the rest
// are appended to existing in their original order. Duplicates within additional
// itself are preserved (only membership in existing is checked).
func combineDocumentation(existing, additional []string) []string {
	seen := make(map[string]struct{}, len(existing))
	for _, s := range existing {
		seen[s] = struct{}{}
	}

	merged := existing
	for _, s := range additional {
		if _, dup := seen[s]; !dup {
			merged = append(merged, s)
		}
	}

	return merged
}
125+
126+
// stringSliceContains reports whether target occurs in slice.
func stringSliceContains(slice []string, target string) bool {
	for i := 0; i < len(slice); i++ {
		if slice[i] == target {
			return true
		}
	}
	return false
}

bindings/go/scip/sanitize.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
package scip
2+
3+
import (
4+
"unicode/utf8"
5+
)
6+
7+
// SanitizeDocument ensures that all strings in the given document are valid UTF-8.
8+
// This is a requirement for successful protobuf encoding.
9+
func SanitizeDocument(document *Document) *Document {
10+
document.Language = sanitizeString(document.Language)
11+
document.RelativePath = sanitizeString(document.RelativePath)
12+
document.Occurrences = SanitizeOccurrences(document.Occurrences)
13+
document.Symbols = SanitizeSymbols(document.Symbols)
14+
return document
15+
}
16+
17+
// SanitizeOccurrences ensures that all strings in the given occurrence slice are valid UTF-8.
18+
// The input slice is modified in-place but returned for convenience.
19+
// This is a requirement for successful protobuf encoding.
20+
func SanitizeOccurrences(occurrences []*Occurrence) []*Occurrence {
21+
for i, occurrence := range occurrences {
22+
occurrences[i] = SanitizeOccurrence(occurrence)
23+
}
24+
25+
return occurrences
26+
}
27+
28+
// SanitizeOccurrence ensures that all strings in the given occurrence are valid UTF-8.
29+
// This is a requirement for successful protobuf encoding.
30+
func SanitizeOccurrence(occurrence *Occurrence) *Occurrence {
31+
occurrence.Symbol = sanitizeString(occurrence.Symbol)
32+
occurrence.OverrideDocumentation = sanitizeStringSlice(occurrence.OverrideDocumentation)
33+
occurrence.Diagnostics = SanitizeDiagnostics(occurrence.Diagnostics)
34+
return occurrence
35+
}
36+
37+
// SanitizeDiagnostics ensures that all strings in the given diagnostic slice are valid UTF-8.
38+
// The input slice is modified in-place but returned for convenience.
39+
// This is a requirement for successful protobuf encoding.
40+
func SanitizeDiagnostics(diagnostics []*Diagnostic) []*Diagnostic {
41+
for i, diagnostic := range diagnostics {
42+
diagnostics[i] = SanitizeDiagnostic(diagnostic)
43+
}
44+
45+
return diagnostics
46+
}
47+
48+
// SanitizeDiagnostic ensures that all strings in the given diagnostic are valid UTF-8.
49+
// This is a requirement for successful protobuf encoding.
50+
func SanitizeDiagnostic(diagnostic *Diagnostic) *Diagnostic {
51+
diagnostic.Code = sanitizeString(diagnostic.Code)
52+
diagnostic.Message = sanitizeString(diagnostic.Message)
53+
diagnostic.Source = sanitizeString(diagnostic.Source)
54+
return diagnostic
55+
}
56+
57+
// SanitizeSymbols ensures that all strings in the given symbols slice are valid UTF-8.
58+
// The input slice is modified in-place but returned for convenience.
59+
// This is a requirement for successful protobuf encoding.
60+
func SanitizeSymbols(symbols []*SymbolInformation) []*SymbolInformation {
61+
for i, symbol := range symbols {
62+
symbols[i] = SanitizeSymbol(symbol)
63+
}
64+
65+
return symbols
66+
}
67+
68+
// SanitizeSymbol ensures that all strings in the given symbol are valid UTF-8.
69+
// This is a requirement for successful protobuf encoding.
70+
func SanitizeSymbol(symbol *SymbolInformation) *SymbolInformation {
71+
symbol.Symbol = sanitizeString(symbol.Symbol)
72+
symbol.Documentation = sanitizeStringSlice(symbol.Documentation)
73+
74+
for _, relationship := range symbol.Relationships {
75+
relationship.Symbol = sanitizeString(relationship.Symbol)
76+
}
77+
78+
return symbol
79+
}
80+
81+
// sanitizeStringSlice ensures the strings in the given slice are all valid UTF-8.
82+
// The input slice is modified in-place but returned for convenience.
83+
// This is a requirement for successful protobuf encoding.
84+
func sanitizeStringSlice(ss []string) []string {
85+
for i, s := range ss {
86+
ss[i] = sanitizeString(s)
87+
}
88+
89+
return ss
90+
}
91+
92+
// sanitizeString coerces a string into valid UTF-8 (if it's not already).
93+
// sanitizeString coerces a string into valid UTF-8 (if it's not already).
func sanitizeString(s string) string {
	if !utf8.ValidString(s) {
		// Round-tripping through []rune replaces each invalid byte sequence
		// with U+FFFD, yielding a guaranteed-valid UTF-8 string.
		return string([]rune(s))
	}
	return s
}

0 commit comments

Comments
 (0)