sourcegraph
diff --git a/‎bindings/go/scip/scip.pb.go
Lines changed: 513 additions & 402 deletions b/‎bindings/go/scip/scip.pb.go
Lines changed: 513 additions & 402 deletions
diff --git a/‎bindings/haskell/src/Proto/Scip.hs
Lines changed: 1259 additions & 1042 deletions b/‎bindings/haskell/src/Proto/Scip.hs
Lines changed: 1259 additions & 1042 deletions
diff --git a/‎bindings/haskell/src/Proto/Scip_Fields.hs
Lines changed: 6 additions & 0 deletions b/‎bindings/haskell/src/Proto/Scip_Fields.hs
Lines changed: 6 additions & 0 deletions
diff --git a/‎bindings/rust/src/generated/scip.rs
Lines changed: 1473 additions & 1326 deletions b/‎bindings/rust/src/generated/scip.rs
Lines changed: 1473 additions & 1326 deletions
diff --git a/‎bindings/typescript/scip.ts
Lines changed: 29 additions & 0 deletions b/‎bindings/typescript/scip.ts
Lines changed: 29 additions & 0 deletions
diff --git a/‎docs/scip.md
Lines changed: 61 additions & 6 deletions b/‎docs/scip.md
Lines changed: 61 additions & 6 deletions
diff --git a/‎scip.proto
Lines changed: 43 additions & 1 deletion b/‎scip.proto
Lines changed: 43 additions & 1 deletion
@@ -45,7 +45,8 @@ message Metadata {
   // directory.
   string project_root = 3;
   // Text encoding of the source files on disk that are referenced from
-  // `Document.relative_path`.
+  // `Document.relative_path`. This value is unrelated to the `Document.text`
+  // field, which is a Protobuf string and hence must be UTF-8 encoded.
   TextEncoding text_document_encoding = 4;
 }
 
@@ -102,8 +103,46 @@ message Document {
   // can be used for other purposes as well, for example testing or when working
   // with virtual/in-memory documents.
   string text = 5;
+
+  // Specifies the encoding used for source ranges in this Document.
+  //
+  // Usually, this will match the unit in which the indexer's implementation
+  // language indexes its native string type in O(1) time.
+  // - For an indexer implemented in a JVM/.NET language or JavaScript/TypeScript,
+  //   use UTF16CodeUnitOffsetFromLineStart.
+  // - For an indexer implemented in Python,
+  //   use UTF8CodeUnitOffsetFromLineStart.
+  // - For an indexer implemented in Go, Rust or C++,
+  //   use UTF8ByteOffsetFromLineStart.
+  PositionEncoding position_encoding = 6;
 }
 
+// Encoding used to interpret the 'character' value in source ranges.
+enum PositionEncoding {
+  // Default value. This value should not be used by new SCIP indexers
+  // so that a consumer can process the SCIP index without ambiguity.
+  UnspecifiedPositionEncoding = 0;
+  // The 'character' value is interpreted as a byte offset,
+  // assuming that the text for the line is encoded as UTF-8.
+  //
+  // Example: For the string "🚀 Woo" in UTF-8, the bytes are
+  // [240, 159, 154, 128, 32, 87, 111, 111], so the offset for 'W'
+  // would be 5.
+  UTF8ByteOffsetFromLineStart = 1;
+  // The 'character' value is interpreted as an offset in terms of Unicode
+  // code points, as the example shows; it is NOT a count of UTF-8 code
+  // units, which are bytes. NOTE(review): the name is misleading; confirm.
+  // Example: For the string "🚀 Woo", the code points are
+  // ['🚀', ' ', 'W', 'o', 'o'], so the offset for 'W' would be 2.
+  UTF8CodeUnitOffsetFromLineStart = 2;
+  // The 'character' value is interpreted as an offset in terms
+  // of UTF-16 code units.
+  //
+  // Example: For the string "🚀 Woo", the UTF-16 code units are
+  // ['\ud83d', '\ude80', ' ', 'W', 'o', 'o'], so the offset for 'W'
+  // would be 3.
+  UTF16CodeUnitOffsetFromLineStart = 3;
+}
 
 // Symbol is similar to a URI, it identifies a class, method, or a local
 // variable. `SymbolInformation` contains rich metadata about symbols such as
@@ -594,6 +633,9 @@ message Occurrence {
   // line/character values before displaying them in an editor-like UI because
   // editors conventionally use 1-based numbers.
   //
+  // The 'character' value is interpreted according to the Document's
+  // `position_encoding` field (see PositionEncoding).
+  //
   // Historical note: the original draft of this schema had a `Range` message
   // type with `start` and `end` fields of type `Position`, mirroring LSP.
   // Benchmarks revealed that this encoding was inefficient and that we could