[HWASan] Improve symbol indexing (#135967)

StefanBossbaly · Stefan Bossbaly · web-flow · commit 0cf3c437c18e · 2025-05-23T09:43:04.000-07:00
Previously we would add any ELF that contained a build ID to the index, regardless of whether the ELF contained symbols or not. This works for Android, since Soong strips the symbols into a new directory. However, other build systems, like BUCK, write the stripped file in the same directory as the unstripped file. This would cause the hwasan_symbolize script to sometimes add the stripped ELF to its index and ignore the symbolized ELF. The logic has now been changed to only add ELFs that contain symbols (i.e. that have a `.debug_info` section) to the index. If two symbolized ELFs with the same build ID are encountered and their contents differ, we now report a build ID collision error on stderr and keep the first file that was indexed. Fixes #135966 --------- Co-authored-by: Stefan Bossbaly <sboss@meta.com>
diff --git a/compiler-rt/lib/hwasan/scripts/hwasan_symbolize b/compiler-rt/lib/hwasan/scripts/hwasan_symbolize
@@ -16,6 +16,7 @@ from __future__ import unicode_literals
 
 import argparse
 import glob
+import hashlib
 import html
 import json
 import mmap
@@ -37,8 +38,9 @@ if sys.version_info.major < 3:
 Ehdr_size = 64
 e_shnum_offset = 60
 e_shoff_offset = 40
-
+e_shstrndx_offset = 62
 Shdr_size = 64
+sh_name_offset = 0
 sh_type_offset = 4
 sh_offset_offset = 24
 sh_size_offset = 32
@@ -62,33 +64,70 @@ def handle_Nhdr(mv, sh_size):
     offset += Nhdr_size + align_up(n_namesz, 4) + align_up(n_descsz, 4)
   return None
 
-def handle_Shdr(mv):
+def handle_shstrtab(mv, e_shoff):
+  e_shstrndx, = struct.unpack_from('<H', buffer=mv, offset=e_shstrndx_offset)
+  
+  start_shstrndx = e_shoff + e_shstrndx * Shdr_size
+  shstrndx_sh = mv[start_shstrndx: start_shstrndx + Shdr_size]
+  _, shstrndx_sh_offset, shstrndx_sh_size = handle_Shdr(shstrndx_sh)
+  return mv[shstrndx_sh_offset:shstrndx_sh_offset + shstrndx_sh_size]
+
+def read_string(mv):
+  name = ""
+  for byte in mv:
+    char = chr(byte)
+    if char == '\x00':
+      break
+    name += char
+  return name
+
+def unpack_sh_type(mv):
   sh_type, = struct.unpack_from('<I', buffer=mv, offset=sh_type_offset)
-  if sh_type != SHT_NOTE:
-    return None, None
+  return sh_type
+
+def handle_Shdr(mv):
+  name_offset, = struct.unpack_from('<I', buffer=mv, offset=sh_name_offset)
   sh_offset, = struct.unpack_from('<Q', buffer=mv, offset=sh_offset_offset)
   sh_size, = struct.unpack_from('<Q', buffer=mv, offset=sh_size_offset)
-  return sh_offset, sh_size
+  return name_offset, sh_offset, sh_size
 
 def handle_elf(mv):
   # \x02 is ELFCLASS64, \x01 is ELFDATA2LSB. HWASan currently only works on
   # 64-bit little endian platforms (x86_64 and ARM64). If this changes, we will
   # have to extend the parsing code.
   if mv[:6] != b'\x7fELF\x02\x01':
     return None
+  found_symbols = False
+  bid = None
   e_shnum, = struct.unpack_from('<H', buffer=mv, offset=e_shnum_offset)
   e_shoff, = struct.unpack_from('<Q', buffer=mv, offset=e_shoff_offset)
+
+  # Section where all the section header names are stored.
+  shstr = handle_shstrtab(mv, e_shoff)
+
   for i in range(0, e_shnum):
     start = e_shoff + i * Shdr_size
-    sh_offset, sh_size = handle_Shdr(mv[start: start + Shdr_size])
-    if sh_offset is None:
-      continue
-    note_hdr = mv[sh_offset: sh_offset + sh_size]
-    result = handle_Nhdr(note_hdr, sh_size)
-    if result is not None:
-      return result
+    sh = mv[start: start + Shdr_size]
+    sh_name_offset, sh_offset, sh_size = handle_Shdr(sh)
+    sh_name = read_string(shstr[sh_name_offset:])
+    sh_type = unpack_sh_type(sh)
+
+    if sh_name == ".debug_info":
+      found_symbols = True
+    if sh_type == SHT_NOTE:
+      if sh_offset is None:
+        continue
+      note_hdr = mv[sh_offset: sh_offset + sh_size]
+      result = handle_Nhdr(note_hdr, sh_size)
+      if result is not None:
+        bid = result
+
+  if found_symbols:
+    return bid
+  else:
+    return None
 
-def get_buildid(filename):
+def read_elf(filename):
   with open(filename, "r") as fd:
     if os.fstat(fd.fileno()).st_size < Ehdr_size:
       return None
@@ -200,7 +239,7 @@ class Symbolizer:
       if os.path.exists(full_path):
         return full_path
     if name not in self.__warnings:
-      print("Could not find symbols for", name, file=sys.stderr)
+      print("Could not find symbols for {} (Build ID: {})".format(name, buildid), file=sys.stderr)
       self.__warnings.add(name)
     return None
 
@@ -268,13 +307,30 @@ class Symbolizer:
         for fn in fnames:
           filename = os.path.join(dname, fn)
           try:
-            bid = get_buildid(filename)
+            bid = read_elf(filename)
           except FileNotFoundError:
             continue
           except Exception as e:
             print("Failed to parse {}: {}".format(filename, e), file=sys.stderr)
             continue
-          if bid is not None:
+          if bid is None:
+            continue
+
+          if bid in self.__index:
+            index_filename = self.__index[bid]
+
+            if os.path.samefile(index_filename, filename):
+              continue
+
+            with open(filename, "rb") as f:
+              file_hash = hashlib.file_digest(f, "sha256")
+
+            with open(index_filename, "rb") as f:
+              index_file_hash = hashlib.file_digest(f, "sha256")
+
+            if index_file_hash.digest() != file_hash.digest():
+              print("Build ID collision! Files share the same BuildId ({}) but their contents differ. Files {} and {} ".format(bid, filename, index_filename), file=sys.stderr)
+          else:
             self.__index[bid] = filename
 
   def symbolize_line(self, line):