treat paths in history based reindex correctly

vladak · vladak · commit 520ce3b90865 · 2024-02-19T10:25:54.000+01:00
fixes #4317
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/history/FileCollector.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/history/FileCollector.java
@@ -18,15 +18,15 @@
  */
 
 /*
- * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved.
  */
 package org.opengrok.indexer.history;
 
 import org.jetbrains.annotations.VisibleForTesting;
 
 import java.util.Collection;
-import java.util.SortedSet;
-import java.util.TreeSet;
+import java.util.HashSet;
+import java.util.Set;
 
 /**
  * This class is meant to collect files that were touched in some way by SCM update.
@@ -36,14 +36,11 @@
  * in one changeset a file may be deleted, only to be re-added in the next changeset etc.
  */
 public class FileCollector extends ChangesetVisitor {
-    private final SortedSet<String> files;
+    private final Set<String> files;
 
-    /**
-     * Assumes comparing in the same way as {@code org.opengrok.indexer.index.IndexDatabase#FILENAME_COMPARATOR}.
-     */
     public FileCollector(boolean consumeMergeChangesets) {
         super(consumeMergeChangesets);
-        files = new TreeSet<>();
+        files = new HashSet<>();
     }
 
     public void accept(RepositoryWithHistoryTraversal.ChangesetInfo changesetInfo) {
@@ -58,7 +55,10 @@ public void accept(RepositoryWithHistoryTraversal.ChangesetInfo changesetInfo) {
         }
     }
 
-    public SortedSet<String> getFiles() {
+    /**
+     * @return set of file paths relative to source root. There are no guarantees w.r.t. ordering.
+     */
+    public Set<String> getFiles() {
         return files;
     }
 
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexCheck.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexCheck.java
@@ -185,6 +185,8 @@ private void checkProjectsParallel(IndexCheckMode mode, Set<String> projectNames
                     // It will be thrown once all the checks complete.
                     LOGGER.log(Level.WARNING, "could not perform index check", exception);
                     ioException = (IOException) exception;
+                } else {
+                    throw new IndexCheckException(exception);
                 }
             }
         }
@@ -515,19 +517,30 @@ static List<Path> getLiveDocumentPaths(Path indexPath) throws IOException {
 
             Bits liveDocs = MultiBits.getLiveDocs(indexReader);
 
+            LOGGER.log(Level.FINEST, "maxDoc = {0}", indexReader.maxDoc());
             for (int i = 0; i < indexReader.maxDoc(); i++) {
                 Document doc = indexReader.storedFields().document(i);
 
                 // liveDocs is null if the index has no deletions.
                 if (liveDocs != null && !liveDocs.get(i)) {
+                    IndexableField field = doc.getField(QueryBuilder.U);
+                    if (field != null) {
+                        String uidString = field.stringValue();
+                        LOGGER.log(Level.FINEST, "ignoring ''{0}'' at {1}",
+                                new Object[]{Util.uid2url(uidString), Util.uid2date(uidString)});
+                    } else {
+                        LOGGER.log(Level.FINEST, "ignoring {0}", doc);
+                    }
                     continue;
                 }
 
                 // This should avoid the special LOC documents.
                 IndexableField field = doc.getField(QueryBuilder.U);
                 if (field != null) {
-                    String uid = field.stringValue();
-                    livePaths.add(Path.of(Util.uid2url(uid)));
+                    String uidString = field.stringValue();
+                    LOGGER.log(Level.FINEST, "live doc: ''{0}'' at {1}",
+                            new Object[]{Util.uid2url(uidString), Util.uid2date(uidString)});
+                    livePaths.add(Path.of(Util.uid2url(uidString)));
                 }
             }
 
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexCheckException.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexCheckException.java
@@ -45,6 +45,10 @@ public IndexCheckException(String message, Set<Path> paths) {
         failedPaths.addAll(paths);
     }
 
+    public IndexCheckException(Exception e) {
+        super(e);
+    }
+
     /**
      * @return set of source paths which failed the check
      */
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java
@@ -134,7 +134,24 @@ public class IndexDatabase {
 
     private static final Logger LOGGER = LoggerFactory.getLogger(IndexDatabase.class);
 
-    private static final Comparator<File> FILENAME_COMPARATOR = Comparator.comparing(File::getName);
+    @VisibleForTesting
+    static final Comparator<File> FILENAME_COMPARATOR = Comparator.comparing(File::getName);
+
+    @VisibleForTesting
+    static final Comparator<Path> FILEPATH_COMPARATOR = (p1, p2) -> {
+        int nameCount = Math.min(p1.getNameCount(), p2.getNameCount());
+        int i;
+        for (i = 0; i < nameCount; i++) {
+            var c1 = p1.getName(i).toString();
+            var c2 = p2.getName(i).toString();
+            if (c1.equals(c2)) {
+                continue;
+            }
+            return c1.compareTo(c2);
+        }
+
+        return Integer.compare(p1.getNameCount(), p2.getNameCount());
+    };
 
     private static final Set<String> CHECK_FIELDS;
 
@@ -197,6 +214,22 @@ public IndexDatabase() throws IOException {
         this(null);
     }
 
+    /**
+     * Anyone using this constructor is supposed to never call {@link #update()}.
+     * Do not use for anything besides testing.
+     * @param uidIter uid iterator
+     * @param writer index writer
+     * @throws IOException on error
+     */
+    @VisibleForTesting
+    IndexDatabase(Project project, TermsEnum uidIter, IndexWriter writer) throws IOException {
+        this(project, new IndexDownArgsFactory());
+        this.uidIter = uidIter;
+        this.writer = writer;
+        this.completer = new PendingFileCompleter();
+        initialize();
+    }
+
     /**
      * Create a new instance of an Index Database for a given project.
      *
@@ -709,8 +742,7 @@ public void update() throws IOException {
                         if (stat == TermsEnum.SeekStatus.END) {
                             uidIter = null;
                             LOGGER.log(Level.WARNING,
-                                "Couldn''t find a start term for {0}, empty u field?",
-                                startUid);
+                                "Couldn''t find a start term for {0}, empty u field?", startUid);
                         }
                     }
 
@@ -819,19 +851,25 @@ private void setupDeletedUids() throws IOException {
         Statistics stat = new Statistics();
         LOGGER.log(Level.FINEST, "traversing the documents in {0} to collect uids of deleted documents",
                 indexDirectory);
+        StoredFields storedFields = reader.storedFields();
         for (int i = 0; i < reader.maxDoc(); i++) {
+            Document doc = storedFields.document(i, LIVE_CHECK_FIELDS);  // use limited-field version
+            IndexableField field = doc.getField(QueryBuilder.U);
             if (!liveDocs.get(i)) {
-                StoredFields storedFields = reader.storedFields();
-                Document doc = storedFields.document(i, LIVE_CHECK_FIELDS);  // use limited-field version
-                IndexableField field = doc.getField(QueryBuilder.U);
                 if (field != null) {
                     if (LOGGER.isLoggable(Level.FINEST)) {
                         String uidString = field.stringValue();
-                        LOGGER.log(Level.FINEST, "adding ''{0}'' at {1} to deleted uid set",
-                                new Object[]{Util.uid2url(uidString), Util.uid2date(uidString)});
+                        LOGGER.log(Level.FINEST, "adding ''{0}'' ({2}) at {1} to deleted uid set",
+                                new Object[]{Util.uid2url(uidString), Util.uid2date(uidString), i});
                     }
                     deletedUids.add(field.stringValue());
                 }
+            } else {
+                if (field != null) {
+                    String uidString = field.stringValue();
+                    LOGGER.log(Level.FINEST, "live doc: ''{0}'' ({2}) at {1}",
+                            new Object[]{Util.uid2url(uidString), Util.uid2date(uidString), i});
+                }
             }
         }
         stat.report(LOGGER, Level.FINEST, String.format("found %s deleted documents in %s",
@@ -931,12 +969,17 @@ void indexDownUsingHistory(File sourceRoot, IndexDownArgs args) throws IOExcepti
 
         try (Progress progress = new Progress(LOGGER, String.format("collecting files for %s", project),
                 fileCollector.getFiles().size())) {
-            for (String path : fileCollector.getFiles()) {
+            List<Path> paths = fileCollector.getFiles().stream().
+                    map(Path::of).
+                    sorted(FILEPATH_COMPARATOR).
+                    collect(Collectors.toList());
+            LOGGER.log(Level.FINEST, "collected sorted files: {0}", paths);
+            for (Path path : paths) {
                 if (isInterrupted()) {
                     return;
                 }
-                File file = new File(sourceRoot, path);
-                processFileHistoryBased(args, file, path);
+                File file = new File(sourceRoot, path.toString());
+                processFileHistoryBased(args, file, path.toString());
                 progress.increment();
             }
         }
@@ -1096,16 +1139,17 @@ private void removeAnnotationFile(String path) {
      * and queue the removal of the associated xref file.
      *
      * @param removeHistory if false, do not remove history cache for this file
+     * @return deleted uid (as string)
      * @throws java.io.IOException if an error occurs
      */
-    private void removeFile(boolean removeHistory) throws IOException {
+    private String removeFile(boolean removeHistory) throws IOException {
         String path = Util.uid2url(uidIter.term().utf8ToString());
 
         for (IndexChangedListener listener : listeners) {
             listener.fileRemove(path);
         }
 
-        removeFileDocUid(path);
+        String deletedUid = removeFileDocUid(path);
 
         removeXrefFile(path);
 
@@ -1122,9 +1166,11 @@ private void removeFile(boolean removeHistory) throws IOException {
         for (IndexChangedListener listener : listeners) {
             listener.fileRemoved(path);
         }
+
+        return deletedUid;
     }
 
-    private void removeFileDocUid(String path) throws IOException {
+    private String removeFileDocUid(String path) throws IOException {
 
         // Determine if a reversal of counts is necessary, and execute if so.
         if (isCountingDeltas) {
@@ -1141,6 +1187,8 @@ private void removeFileDocUid(String path) throws IOException {
         }
 
         writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term()));
+
+        return uidIter.term().utf8ToString();
     }
 
     private void decrementLOCforDoc(String path, Document doc) {
@@ -1648,6 +1696,17 @@ void indexDown(File dir, String parent, IndexDownArgs args, Progress progress) t
         }
     }
 
+    /**
+     * wrapper for fatal errors during indexing.
+     */
+    public static class IndexerFault extends RuntimeException {
+        private static final long serialVersionUID = -1;
+
+        public IndexerFault(String message) {
+            super(message);
+        }
+    }
+
     /**
      * Compared with {@link #processFile(IndexDownArgs, File, String)}, this method's file/path arguments
      * represent files that have actually changed in some way, while the other method's argument represent
@@ -1660,12 +1719,14 @@ void indexDown(File dir, String parent, IndexDownArgs args, Progress progress) t
     @VisibleForTesting
     void processFileHistoryBased(IndexDownArgs args, File file, String path) throws IOException {
         final boolean fileExists = file.exists();
-
+        final Set<String> deletedUidsHere = new HashSet<>();
         path = Util.fixPathIfWindows(path);
+
         // Traverse terms until reaching document beyond path of given file.
-        while (uidIter != null && uidIter.term() != null
-                && uidIter.term().compareTo(emptyBR) != 0
-                && Util.uid2url(uidIter.term().utf8ToString()).compareTo(path) <= 0) {
+        while (uidIter != null && uidIter.term() != null && uidIter.term().compareTo(emptyBR) != 0
+                && FILEPATH_COMPARATOR.compare(
+                        Path.of(Util.uid2url(uidIter.term().utf8ToString())),
+                        Path.of(path)) <= 0) {
 
             if (deletedUids.contains(uidIter.term().utf8ToString())) {
                 logIgnoredUid(uidIter.term().utf8ToString());
@@ -1688,9 +1749,10 @@ void processFileHistoryBased(IndexDownArgs args, File file, String path) throws
                 if (!matchOK) {
                     removeFile(false);
                     addWorkHistoryBased(args, termFile, termPath);
+                    deletedUidsHere.add(removeFile(false));
                 }
             } else {
-                removeFile(!fileExists);
+                deletedUidsHere.add(removeFile(!fileExists));
             }
 
             BytesRef next = uidIter.next();
@@ -1703,6 +1765,18 @@ void processFileHistoryBased(IndexDownArgs args, File file, String path) throws
         // That said, it is necessary to check whether the file can be accepted. This is done in the function below.
         // Also, allow for broken symbolic links (File.exists() returns false for these).
         if (fileExists || Files.isSymbolicLink(file.toPath())) {
+            // This assumes that the last modified time is indeed what the indexer uses when adding the document.
+            String time = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND);
+            if (deletedUidsHere.contains(Util.path2uid(path, time))) {
+                //
+                // Adding document with the same date of a pre-existing document which is being removed
+                // will lead to index corruption (duplicate documents). Hence, make the indexer to fail hard.
+                //
+                throw new IndexerFault(
+                        String.format("attempting to add file '%s' with date matching deleted document: %s",
+                                path, time));
+            }
+
             addWorkHistoryBased(args, file, path);
         }
     }
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java
@@ -287,7 +287,12 @@ public static void main(String[] argv) {
                     System.exit(2);
                 } catch (IndexCheckException e) {
                     System.err.printf("Index check failed%n");
-                    System.err.print("You might want to remove " + e.getFailedPaths());
+                    if (!e.getFailedPaths().isEmpty()) {
+                        System.err.print("You might want to remove " + e.getFailedPaths());
+                    } else {
+                        System.err.println("with exception: " + e);
+                        e.printStackTrace(System.err);
+                    }
                     System.exit(1);
                 }
 
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/web/Util.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/web/Util.java
@@ -823,7 +823,7 @@ private static void writeAnnotation(int num, Writer out, Annotation annotation,
      * walk of the file hierarchy. Thus, null character (\u0000) is used both to
      * separate directory components and to separate the path from the date.
      *
-     * @param path path to mangle.
+     * @param path path to mangle. This is assumed to be relative to source root.
      * @param date date string to use.
      * @return the mangled path.
      */
diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexDatabasePathComparatorTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexDatabasePathComparatorTest.java
diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexDatabaseTestHistBasedIterationTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexDatabaseTestHistBasedIterationTest.java

Original file line number	Diff line number	Diff line change
`@@ -823,7 +823,7 @@ private static void writeAnnotation(int num, Writer out, Annotation annotation,`
`823`	`823`	`* walk of the file hierarchy. Thus, null character (\u0000) is used both to`
`824`	`824`	`* separate directory components and to separate the path from the date.`
`825`	`825`	`*`
`826`		`- * @param path path to mangle.`
	`826`	`+ * @param path path to mangle. This is assumed to be relative to source root.`
`827`	`827`	`* @param date date string to use.`
`828`	`828`	`* @return the mangled path.`
`829`	`829`	`*/`