Skip to content

Commit e720fb8

Browse files
authored
Move SCIP utilities from sourcegraph/sourcegraph (#138)
1 parent f3c33b4 commit e720fb8

File tree

5 files changed

+659
-0
lines changed

5 files changed

+659
-0
lines changed

bindings/go/scip/canonicalize.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
package scip
2+
3+
// CanonicalizeDocument deterministically re-orders the fields of the given document.
4+
func CanonicalizeDocument(document *Document) *Document {
5+
document.Occurrences = CanonicalizeOccurrences(document.Occurrences)
6+
document.Symbols = CanonicalizeSymbols(document.Symbols)
7+
return SanitizeDocument(document)
8+
}
9+
10+
// CanonicalizeOccurrences deterministically re-orders the fields of the given occurrence slice.
11+
func CanonicalizeOccurrences(occurrences []*Occurrence) []*Occurrence {
12+
canonicalized := make([]*Occurrence, 0, len(occurrences))
13+
for _, occurrence := range FlattenOccurrences(occurrences) {
14+
canonicalized = append(canonicalized, CanonicalizeOccurrence(occurrence))
15+
}
16+
17+
return SortOccurrences(canonicalized)
18+
}
19+
20+
// CanonicalizeOccurrence deterministically re-orders the fields of the given occurrence.
21+
func CanonicalizeOccurrence(occurrence *Occurrence) *Occurrence {
22+
// Express ranges as three-components if possible
23+
occurrence.Range = NewRange(occurrence.Range).SCIPRange()
24+
occurrence.Diagnostics = CanonicalizeDiagnostics(occurrence.Diagnostics)
25+
return occurrence
26+
}
27+
28+
// CanonicalizeDiagnostics deterministically re-orders the fields of the given diagnostic slice.
29+
func CanonicalizeDiagnostics(diagnostics []*Diagnostic) []*Diagnostic {
30+
canonicalized := make([]*Diagnostic, 0, len(diagnostics))
31+
for _, diagnostic := range diagnostics {
32+
canonicalized = append(canonicalized, CanonicalizeDiagnostic(diagnostic))
33+
}
34+
35+
return SortDiagnostics(canonicalized)
36+
}
37+
38+
// CanonicalizeDiagnostic deterministically re-orders the fields of the given diagnostic.
39+
func CanonicalizeDiagnostic(diagnostic *Diagnostic) *Diagnostic {
40+
diagnostic.Tags = SortDiagnosticTags(diagnostic.Tags)
41+
return diagnostic
42+
}
43+
44+
// CanonicalizeSymbols deterministically re-orders the fields of the given symbols slice.
45+
func CanonicalizeSymbols(symbols []*SymbolInformation) []*SymbolInformation {
46+
canonicalized := make([]*SymbolInformation, 0, len(symbols))
47+
for _, symbol := range FlattenSymbols(symbols) {
48+
canonicalized = append(canonicalized, CanonicalizeSymbol(symbol))
49+
}
50+
51+
return SortSymbols(canonicalized)
52+
}
53+
54+
// CanonicalizeSymbol deterministically re-orders the fields of the given symbol.
55+
func CanonicalizeSymbol(symbol *SymbolInformation) *SymbolInformation {
56+
symbol.Relationships = CanonicalizeRelationships(symbol.Relationships)
57+
return symbol
58+
}
59+
60+
// CanonicalizeRelationships deterministically re-orders the fields of the given relationship slice.
61+
func CanonicalizeRelationships(relationships []*Relationship) []*Relationship {
62+
return SortRelationships(FlattenRelationship(relationships))
63+
}

bindings/go/scip/flatten.go

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
package scip
2+
3+
// FlattenDocuments merges elements of the given slice with the same relative path. This allows us to make
4+
// the assumption post-canonicalization that each index has one representation of a given document path in
5+
// the database. This function returns a new slice.
6+
func FlattenDocuments(documents []*Document) []*Document {
7+
documentMap := make(map[string]*Document, len(documents))
8+
for _, document := range documents {
9+
existing, ok := documentMap[document.RelativePath]
10+
if !ok {
11+
documentMap[document.RelativePath] = document
12+
continue
13+
}
14+
if existing.Language != document.Language {
15+
_ = 0 // TODO - warn?
16+
}
17+
18+
existing.Symbols = append(existing.Symbols, document.Symbols...)
19+
existing.Occurrences = append(existing.Occurrences, document.Occurrences...)
20+
}
21+
22+
flattened := make([]*Document, 0, len(documentMap))
23+
for _, document := range documentMap {
24+
flattened = append(flattened, document)
25+
}
26+
27+
return flattened
28+
}
29+
30+
// FlattenSymbol merges elements of the given slice with the same symbol name. This allows us to make the
31+
// assumption post-canonicalization that each index and document refer to one symbol metadata object uniquely.
32+
// This function returns a new slice.
33+
func FlattenSymbols(symbols []*SymbolInformation) []*SymbolInformation {
34+
symbolMap := make(map[string]*SymbolInformation, len(symbols))
35+
for _, symbol := range symbols {
36+
existing, ok := symbolMap[symbol.Symbol]
37+
if !ok {
38+
symbolMap[symbol.Symbol] = symbol
39+
continue
40+
}
41+
42+
existing.Documentation = combineDocumentation(existing.Documentation, symbol.Documentation)
43+
existing.Relationships = append(existing.Relationships, symbol.Relationships...)
44+
}
45+
46+
flattened := make([]*SymbolInformation, 0, len(symbolMap))
47+
for _, symbol := range symbolMap {
48+
flattened = append(flattened, symbol)
49+
}
50+
51+
return flattened
52+
}
53+
54+
// FlattenOccurrences merges elements of the given slice with equivalent bounds. This function returns a new slice.
55+
func FlattenOccurrences(occurrences []*Occurrence) []*Occurrence {
56+
if len(occurrences) == 0 {
57+
return occurrences
58+
}
59+
60+
_ = SortOccurrences(occurrences)
61+
flattened := make([]*Occurrence, 0, len(occurrences))
62+
flattened = append(flattened, occurrences[0])
63+
64+
for _, occurrence := range occurrences[1:] {
65+
top := flattened[len(flattened)-1]
66+
67+
if !rawRangesEqual(top.Range, occurrence.Range) {
68+
flattened = append(flattened, occurrence)
69+
continue
70+
}
71+
if top.Symbol != occurrence.Symbol {
72+
flattened = append(flattened, occurrence)
73+
continue
74+
}
75+
76+
if top.SyntaxKind == SyntaxKind_UnspecifiedSyntaxKind {
77+
// Take first valid syntax kind
78+
top.SyntaxKind = occurrence.SyntaxKind
79+
}
80+
81+
// Combine all other fields
82+
top.SymbolRoles |= occurrence.SymbolRoles
83+
top.OverrideDocumentation = append(top.OverrideDocumentation, occurrence.OverrideDocumentation...)
84+
top.Diagnostics = append(top.Diagnostics, occurrence.Diagnostics...)
85+
}
86+
87+
return flattened
88+
}
89+
90+
// FlattenRelationship merges elements of the given slice with equivalent symbol names. This function returns a new
91+
// slice.
92+
func FlattenRelationship(relationships []*Relationship) []*Relationship {
93+
relationshipMap := make(map[string][]*Relationship, len(relationships))
94+
for _, relationship := range relationships {
95+
relationshipMap[relationship.Symbol] = append(relationshipMap[relationship.Symbol], relationship)
96+
}
97+
98+
flattened := make([]*Relationship, 0, len(relationshipMap))
99+
for _, relationships := range relationshipMap {
100+
combined := relationships[0]
101+
for _, relationship := range relationships[1:] {
102+
combined.IsReference = combined.IsReference || relationship.IsReference
103+
combined.IsImplementation = combined.IsImplementation || relationship.IsImplementation
104+
combined.IsTypeDefinition = combined.IsTypeDefinition || relationship.IsTypeDefinition
105+
combined.IsDefinition = combined.IsDefinition || relationship.IsDefinition
106+
}
107+
108+
flattened = append(flattened, combined)
109+
}
110+
111+
return flattened
112+
}
113+
114+
// combineDocumentation merges documentation components from two separate symbol information objects.
115+
// combineDocumentation merges documentation components from two separate symbol information objects.
// Strings from additional that already appear in existing are dropped; the rest
// are appended to existing in their original order. Duplicates within additional
// itself are preserved (only membership in existing is checked).
func combineDocumentation(existing, additional []string) []string {
	seen := make(map[string]struct{}, len(existing))
	for _, s := range existing {
		seen[s] = struct{}{}
	}

	merged := existing
	for _, s := range additional {
		if _, dup := seen[s]; !dup {
			merged = append(merged, s)
		}
	}

	return merged
}
125+
126+
// stringSliceContains reports whether target occurs in slice.
func stringSliceContains(slice []string, target string) bool {
	for i := 0; i < len(slice); i++ {
		if slice[i] == target {
			return true
		}
	}
	return false
}

bindings/go/scip/sanitize.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
package scip
2+
3+
import (
4+
"unicode/utf8"
5+
)
6+
7+
// SanitizeDocument ensures that all strings in the given document are valid UTF-8.
8+
// This is a requirement for successful protobuf encoding.
9+
func SanitizeDocument(document *Document) *Document {
10+
document.Language = sanitizeString(document.Language)
11+
document.RelativePath = sanitizeString(document.RelativePath)
12+
document.Occurrences = SanitizeOccurrences(document.Occurrences)
13+
document.Symbols = SanitizeSymbols(document.Symbols)
14+
return document
15+
}
16+
17+
// SanitizeOccurrences ensures that all strings in the given occurrence slice are valid UTF-8.
18+
// The input slice is modified in-place but returned for convenience.
19+
// This is a requirement for successful protobuf encoding.
20+
func SanitizeOccurrences(occurrences []*Occurrence) []*Occurrence {
21+
for i, occurrence := range occurrences {
22+
occurrences[i] = SanitizeOccurrence(occurrence)
23+
}
24+
25+
return occurrences
26+
}
27+
28+
// SanitizeOccurrence ensures that all strings in the given occurrence are valid UTF-8.
29+
// This is a requirement for successful protobuf encoding.
30+
func SanitizeOccurrence(occurrence *Occurrence) *Occurrence {
31+
occurrence.Symbol = sanitizeString(occurrence.Symbol)
32+
occurrence.OverrideDocumentation = sanitizeStringSlice(occurrence.OverrideDocumentation)
33+
occurrence.Diagnostics = SanitizeDiagnostics(occurrence.Diagnostics)
34+
return occurrence
35+
}
36+
37+
// SanitizeDiagnostics ensures that all strings in the given diagnostic slice are valid UTF-8.
38+
// The input slice is modified in-place but returned for convenience.
39+
// This is a requirement for successful protobuf encoding.
40+
func SanitizeDiagnostics(diagnostics []*Diagnostic) []*Diagnostic {
41+
for i, diagnostic := range diagnostics {
42+
diagnostics[i] = SanitizeDiagnostic(diagnostic)
43+
}
44+
45+
return diagnostics
46+
}
47+
48+
// SanitizeDiagnostic ensures that all strings in the given diagnostic are valid UTF-8.
49+
// This is a requirement for successful protobuf encoding.
50+
func SanitizeDiagnostic(diagnostic *Diagnostic) *Diagnostic {
51+
diagnostic.Code = sanitizeString(diagnostic.Code)
52+
diagnostic.Message = sanitizeString(diagnostic.Message)
53+
diagnostic.Source = sanitizeString(diagnostic.Source)
54+
return diagnostic
55+
}
56+
57+
// SanitizeSymbols ensures that all strings in the given symbols slice are valid UTF-8.
58+
// The input slice is modified in-place but returned for convenience.
59+
// This is a requirement for successful protobuf encoding.
60+
func SanitizeSymbols(symbols []*SymbolInformation) []*SymbolInformation {
61+
for i, symbol := range symbols {
62+
symbols[i] = SanitizeSymbol(symbol)
63+
}
64+
65+
return symbols
66+
}
67+
68+
// SanitizeSymbol ensures that all strings in the given symbol are valid UTF-8.
69+
// This is a requirement for successful protobuf encoding.
70+
func SanitizeSymbol(symbol *SymbolInformation) *SymbolInformation {
71+
symbol.Symbol = sanitizeString(symbol.Symbol)
72+
symbol.Documentation = sanitizeStringSlice(symbol.Documentation)
73+
74+
for _, relationship := range symbol.Relationships {
75+
relationship.Symbol = sanitizeString(relationship.Symbol)
76+
}
77+
78+
return symbol
79+
}
80+
81+
// sanitizeStringSlice ensures the strings in the given slice are all valid UTF-8.
82+
// The input slice is modified in-place but returned for convenience.
83+
// This is a requirement for successful protobuf encoding.
84+
func sanitizeStringSlice(ss []string) []string {
85+
for i, s := range ss {
86+
ss[i] = sanitizeString(s)
87+
}
88+
89+
return ss
90+
}
91+
92+
// sanitizeString coerces a string into valid UTF-8 (if it's not already).
93+
// sanitizeString coerces a string into valid UTF-8 (if it's not already).
func sanitizeString(s string) string {
	if !utf8.ValidString(s) {
		// Round-tripping through []rune replaces each invalid byte sequence
		// with U+FFFD, yielding a guaranteed-valid UTF-8 string.
		return string([]rune(s))
	}
	return s
}

0 commit comments

Comments
 (0)