Skip to content

Commit ddf6b51

Browse files
OSS-Fuzz Team authored and copybara-github committed
Don't delete /indexes at the beginning of index_build.py
Doing this breaks re-running the snapshotting for some projects when there are no changes, because some OSS-Fuzz build systems won't relink the fuzzer binary in those cases. If we kept /indexes, then index_build.py could simply re-use the previous indexing result even if linking didn't happen again. However, keeping them has two problems: (1) the indexer won't like it if destination files already exist, and (2) we'll pick up stale indexes on a rebuild when there are actual changes. We solve (1) by only deleting the specific index directory when the indexer is about to run. For (2), we try to be a bit smarter about selecting binaries/indexes in index_build.py, to avoid picking up stale indexes with mismatching build IDs. PiperOrigin-RevId: 783745887
1 parent 6a98d95 commit ddf6b51

File tree

2 files changed

+43
-22
lines changed

2 files changed

+43
-22
lines changed

infra/base-images/base-builder/indexer/clang_wrapper.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import os
2727
from pathlib import Path # pylint: disable=g-importing-member
2828
import shlex
29+
import shutil
2930
import subprocess
3031
import sys
3132
import time
@@ -282,8 +283,13 @@ def read_cdb_fragments(cdb_path: Path) -> Any:
282283
def run_indexer(build_id: str, linker_commands: dict[str, Any]):
283284
"""Run the indexer."""
284285
index_dir = INDEXES_PATH / build_id
285-
# TODO: check if this is correct.
286-
index_dir.mkdir(exist_ok=True)
286+
if index_dir.exists():
287+
# A previous indexer already ran for the same build ID. Clear the directory
288+
# so we can re-run the indexer, otherwise we might run into various issues
289+
# (e.g. the indexer doesn't like it when source files already exist).
290+
shutil.rmtree(index_dir)
291+
292+
index_dir.mkdir()
287293

288294
# Use a build-specific compile commands directory, since there could be
289295
# parallel linking happening at the same time.

infra/base-images/base-builder/indexer/index_build.py

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ class BinaryMetadata:
100100
binary_args: list[str]
101101
binary_env: dict[str, str]
102102
build_id: str
103+
build_id_matches: bool
103104
compile_commands: list[dict[str, Any]]
104105
harness_kind: manifest_types.HarnessKind
105106

@@ -228,8 +229,8 @@ def enumerate_build_targets(
228229
logging.info('enumerate_build_targets')
229230
linker_json_paths = list((OUT / 'cdb').glob('*_linker_commands.json'))
230231

231-
targets = []
232232
logging.info('Found %i linker JSON files.', len(linker_json_paths))
233+
binary_to_build_metadata: dict[str, BinaryMetadata] = {}
233234
for linker_json_path in linker_json_paths:
234235
build_id = linker_json_path.name.split('_')[0]
235236
with linker_json_path.open('rt') as f:
@@ -241,7 +242,26 @@ def enumerate_build_targets(
241242
# the binary path and checking the build id should improve the success
242243
# rate.
243244
if (OUT / name).exists():
244-
binary_paths = [binary_path]
245+
# Just because the name matches, doesn't mean it's the right one for
246+
# this linker command.
247+
# Only set this if we haven't already found an exact build ID match.
248+
# We can't always rely on build ID matching, because some builds will
249+
# modify the binary after the linker runs.
250+
if (
251+
name in binary_to_build_metadata
252+
and binary_to_build_metadata[name].build_id_matches
253+
):
254+
continue
255+
256+
binary_to_build_metadata[name] = BinaryMetadata(
257+
name=name,
258+
binary_args=binary_args,
259+
binary_env=binary_env,
260+
compile_commands=data['compile_commands'],
261+
build_id=build_id,
262+
build_id_matches=build_id == get_build_id(binary_path.as_posix()),
263+
harness_kind=harness_kind,
264+
)
245265
else:
246266
logging.info('trying to find %s with build id %s', name, build_id)
247267
binary_paths = find_fuzzer_binaries(OUT, build_id)
@@ -250,21 +270,19 @@ def enumerate_build_targets(
250270
logging.error('could not find %s with build id %s', name, build_id)
251271
continue
252272

253-
for binary_path in binary_paths:
254-
compile_commands = data['compile_commands']
255-
256-
targets.append(
257-
BinaryMetadata(
258-
name=binary_path.name,
259-
binary_args=binary_args,
260-
binary_env=binary_env,
261-
compile_commands=compile_commands,
262-
build_id=build_id,
263-
harness_kind=harness_kind,
264-
)
265-
)
273+
for binary_path in binary_paths:
274+
compile_commands = data['compile_commands']
275+
binary_to_build_metadata[binary_path.name] = BinaryMetadata(
276+
name=binary_path.name,
277+
binary_args=binary_args,
278+
binary_env=binary_env,
279+
compile_commands=compile_commands,
280+
build_id=build_id,
281+
build_id_matches=True,
282+
harness_kind=harness_kind,
283+
)
266284

267-
return targets
285+
return tuple(binary_to_build_metadata.values())
268286

269287

270288
def copy_fuzzing_engine() -> Path:
@@ -658,10 +676,7 @@ def main():
658676
)
659677
args = parser.parse_args()
660678

661-
# Clear existing indexer artifacts.
662-
if INDEXES_PATH.exists():
663-
shutil.rmtree(INDEXES_PATH)
664-
INDEXES_PATH.mkdir()
679+
INDEXES_PATH.mkdir(exist_ok=True)
665680

666681
# Clean up the existing OUT by default, otherwise we may run into various
667682
# build errors.

0 commit comments

Comments (0)