Skip to content

Commit 520ce3b

Browse files
committed
treat paths in history based reindex correctly
fixes #4317
1 parent b753f02 commit 520ce3b

File tree

8 files changed

+507
-32
lines changed

8 files changed

+507
-32
lines changed

opengrok-indexer/src/main/java/org/opengrok/indexer/history/FileCollector.java

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
21+
* Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved.
2222
*/
2323
package org.opengrok.indexer.history;
2424

2525
import org.jetbrains.annotations.VisibleForTesting;
2626

2727
import java.util.Collection;
28-
import java.util.SortedSet;
29-
import java.util.TreeSet;
28+
import java.util.HashSet;
29+
import java.util.Set;
3030

3131
/**
3232
* This class is meant to collect files that were touched in some way by SCM update.
@@ -36,14 +36,11 @@
3636
* in one changeset a file may be deleted, only to be re-added in the next changeset etc.
3737
*/
3838
public class FileCollector extends ChangesetVisitor {
39-
private final SortedSet<String> files;
39+
private final Set<String> files;
4040

41-
/**
42-
* Assumes comparing in the same way as {@code org.opengrok.indexer.index.IndexDatabase#FILENAME_COMPARATOR}.
43-
*/
4441
public FileCollector(boolean consumeMergeChangesets) {
4542
super(consumeMergeChangesets);
46-
files = new TreeSet<>();
43+
files = new HashSet<>();
4744
}
4845

4946
public void accept(RepositoryWithHistoryTraversal.ChangesetInfo changesetInfo) {
@@ -58,7 +55,10 @@ public void accept(RepositoryWithHistoryTraversal.ChangesetInfo changesetInfo) {
5855
}
5956
}
6057

61-
public SortedSet<String> getFiles() {
58+
/**
59+
* @return set of file paths relative to source root. There are no guarantees w.r.t. ordering.
60+
*/
61+
public Set<String> getFiles() {
6262
return files;
6363
}
6464

opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexCheck.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,8 @@ private void checkProjectsParallel(IndexCheckMode mode, Set<String> projectNames
185185
// It will be thrown once all the checks complete.
186186
LOGGER.log(Level.WARNING, "could not perform index check", exception);
187187
ioException = (IOException) exception;
188+
} else {
189+
throw new IndexCheckException(exception);
188190
}
189191
}
190192
}
@@ -515,19 +517,30 @@ static List<Path> getLiveDocumentPaths(Path indexPath) throws IOException {
515517

516518
Bits liveDocs = MultiBits.getLiveDocs(indexReader);
517519

520+
LOGGER.log(Level.FINEST, "maxDoc = {0}", indexReader.maxDoc());
518521
for (int i = 0; i < indexReader.maxDoc(); i++) {
519522
Document doc = indexReader.storedFields().document(i);
520523

521524
// liveDocs is null if the index has no deletions.
522525
if (liveDocs != null && !liveDocs.get(i)) {
526+
IndexableField field = doc.getField(QueryBuilder.U);
527+
if (field != null) {
528+
String uidString = field.stringValue();
529+
LOGGER.log(Level.FINEST, "ignoring ''{0}'' at {1}",
530+
new Object[]{Util.uid2url(uidString), Util.uid2date(uidString)});
531+
} else {
532+
LOGGER.log(Level.FINEST, "ignoring {0}", doc);
533+
}
523534
continue;
524535
}
525536

526537
// This should avoid the special LOC documents.
527538
IndexableField field = doc.getField(QueryBuilder.U);
528539
if (field != null) {
529-
String uid = field.stringValue();
530-
livePaths.add(Path.of(Util.uid2url(uid)));
540+
String uidString = field.stringValue();
541+
LOGGER.log(Level.FINEST, "live doc: ''{0}'' at {1}",
542+
new Object[]{Util.uid2url(uidString), Util.uid2date(uidString)});
543+
livePaths.add(Path.of(Util.uid2url(uidString)));
531544
}
532545
}
533546

opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexCheckException.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ public IndexCheckException(String message, Set<Path> paths) {
4545
failedPaths.addAll(paths);
4646
}
4747

48+
public IndexCheckException(Exception e) {
49+
super(e);
50+
}
51+
4852
/**
4953
* @return set of source paths which failed the check
5054
*/

opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java

Lines changed: 93 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,24 @@ public class IndexDatabase {
134134

135135
private static final Logger LOGGER = LoggerFactory.getLogger(IndexDatabase.class);
136136

137-
private static final Comparator<File> FILENAME_COMPARATOR = Comparator.comparing(File::getName);
137+
@VisibleForTesting
138+
static final Comparator<File> FILENAME_COMPARATOR = Comparator.comparing(File::getName);
139+
140+
@VisibleForTesting
141+
static final Comparator<Path> FILEPATH_COMPARATOR = (p1, p2) -> {
142+
int nameCount = Math.min(p1.getNameCount(), p2.getNameCount());
143+
int i;
144+
for (i = 0; i < nameCount; i++) {
145+
var c1 = p1.getName(i).toString();
146+
var c2 = p2.getName(i).toString();
147+
if (c1.equals(c2)) {
148+
continue;
149+
}
150+
return c1.compareTo(c2);
151+
}
152+
153+
return Integer.compare(p1.getNameCount(), p2.getNameCount());
154+
};
138155

139156
private static final Set<String> CHECK_FIELDS;
140157

@@ -197,6 +214,22 @@ public IndexDatabase() throws IOException {
197214
this(null);
198215
}
199216

217+
/**
218+
* Anyone using this constructor is supposed to never call {@link #update()}.
219+
* Do not use for anything besides testing.
220+
* @param uidIter uid iterator
221+
* @param writer index writer
222+
* @throws IOException on error
223+
*/
224+
@VisibleForTesting
225+
IndexDatabase(Project project, TermsEnum uidIter, IndexWriter writer) throws IOException {
226+
this(project, new IndexDownArgsFactory());
227+
this.uidIter = uidIter;
228+
this.writer = writer;
229+
this.completer = new PendingFileCompleter();
230+
initialize();
231+
}
232+
200233
/**
201234
* Create a new instance of an Index Database for a given project.
202235
*
@@ -709,8 +742,7 @@ public void update() throws IOException {
709742
if (stat == TermsEnum.SeekStatus.END) {
710743
uidIter = null;
711744
LOGGER.log(Level.WARNING,
712-
"Couldn''t find a start term for {0}, empty u field?",
713-
startUid);
745+
"Couldn''t find a start term for {0}, empty u field?", startUid);
714746
}
715747
}
716748

@@ -819,19 +851,25 @@ private void setupDeletedUids() throws IOException {
819851
Statistics stat = new Statistics();
820852
LOGGER.log(Level.FINEST, "traversing the documents in {0} to collect uids of deleted documents",
821853
indexDirectory);
854+
StoredFields storedFields = reader.storedFields();
822855
for (int i = 0; i < reader.maxDoc(); i++) {
856+
Document doc = storedFields.document(i, LIVE_CHECK_FIELDS); // use limited-field version
857+
IndexableField field = doc.getField(QueryBuilder.U);
823858
if (!liveDocs.get(i)) {
824-
StoredFields storedFields = reader.storedFields();
825-
Document doc = storedFields.document(i, LIVE_CHECK_FIELDS); // use limited-field version
826-
IndexableField field = doc.getField(QueryBuilder.U);
827859
if (field != null) {
828860
if (LOGGER.isLoggable(Level.FINEST)) {
829861
String uidString = field.stringValue();
830-
LOGGER.log(Level.FINEST, "adding ''{0}'' at {1} to deleted uid set",
831-
new Object[]{Util.uid2url(uidString), Util.uid2date(uidString)});
862+
LOGGER.log(Level.FINEST, "adding ''{0}'' ({2}) at {1} to deleted uid set",
863+
new Object[]{Util.uid2url(uidString), Util.uid2date(uidString), i});
832864
}
833865
deletedUids.add(field.stringValue());
834866
}
867+
} else {
868+
if (field != null) {
869+
String uidString = field.stringValue();
870+
LOGGER.log(Level.FINEST, "live doc: ''{0}'' ({2}) at {1}",
871+
new Object[]{Util.uid2url(uidString), Util.uid2date(uidString), i});
872+
}
835873
}
836874
}
837875
stat.report(LOGGER, Level.FINEST, String.format("found %s deleted documents in %s",
@@ -931,12 +969,17 @@ void indexDownUsingHistory(File sourceRoot, IndexDownArgs args) throws IOExcepti
931969

932970
try (Progress progress = new Progress(LOGGER, String.format("collecting files for %s", project),
933971
fileCollector.getFiles().size())) {
934-
for (String path : fileCollector.getFiles()) {
972+
List<Path> paths = fileCollector.getFiles().stream().
973+
map(Path::of).
974+
sorted(FILEPATH_COMPARATOR).
975+
collect(Collectors.toList());
976+
LOGGER.log(Level.FINEST, "collected sorted files: {0}", paths);
977+
for (Path path : paths) {
935978
if (isInterrupted()) {
936979
return;
937980
}
938-
File file = new File(sourceRoot, path);
939-
processFileHistoryBased(args, file, path);
981+
File file = new File(sourceRoot, path.toString());
982+
processFileHistoryBased(args, file, path.toString());
940983
progress.increment();
941984
}
942985
}
@@ -1096,16 +1139,17 @@ private void removeAnnotationFile(String path) {
10961139
* and queue the removal of the associated xref file.
10971140
*
10981141
* @param removeHistory if false, do not remove history cache for this file
1142+
* @return deleted uid (as string)
10991143
* @throws java.io.IOException if an error occurs
11001144
*/
1101-
private void removeFile(boolean removeHistory) throws IOException {
1145+
private String removeFile(boolean removeHistory) throws IOException {
11021146
String path = Util.uid2url(uidIter.term().utf8ToString());
11031147

11041148
for (IndexChangedListener listener : listeners) {
11051149
listener.fileRemove(path);
11061150
}
11071151

1108-
removeFileDocUid(path);
1152+
String deletedUid = removeFileDocUid(path);
11091153

11101154
removeXrefFile(path);
11111155

@@ -1122,9 +1166,11 @@ private void removeFile(boolean removeHistory) throws IOException {
11221166
for (IndexChangedListener listener : listeners) {
11231167
listener.fileRemoved(path);
11241168
}
1169+
1170+
return deletedUid;
11251171
}
11261172

1127-
private void removeFileDocUid(String path) throws IOException {
1173+
private String removeFileDocUid(String path) throws IOException {
11281174

11291175
// Determine if a reversal of counts is necessary, and execute if so.
11301176
if (isCountingDeltas) {
@@ -1141,6 +1187,8 @@ private void removeFileDocUid(String path) throws IOException {
11411187
}
11421188

11431189
writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term()));
1190+
1191+
return uidIter.term().utf8ToString();
11441192
}
11451193

11461194
private void decrementLOCforDoc(String path, Document doc) {
@@ -1648,6 +1696,17 @@ void indexDown(File dir, String parent, IndexDownArgs args, Progress progress) t
16481696
}
16491697
}
16501698

1699+
/**
1700+
* wrapper for fatal errors during indexing.
1701+
*/
1702+
public static class IndexerFault extends RuntimeException {
1703+
private static final long serialVersionUID = -1;
1704+
1705+
public IndexerFault(String message) {
1706+
super(message);
1707+
}
1708+
}
1709+
16511710
/**
16521711
* Compared with {@link #processFile(IndexDownArgs, File, String)}, this method's file/path arguments
16531712
* represent files that have actually changed in some way, while the other method's argument represent
@@ -1660,12 +1719,14 @@ void indexDown(File dir, String parent, IndexDownArgs args, Progress progress) t
16601719
@VisibleForTesting
16611720
void processFileHistoryBased(IndexDownArgs args, File file, String path) throws IOException {
16621721
final boolean fileExists = file.exists();
1663-
1722+
final Set<String> deletedUidsHere = new HashSet<>();
16641723
path = Util.fixPathIfWindows(path);
1724+
16651725
// Traverse terms until reaching document beyond path of given file.
1666-
while (uidIter != null && uidIter.term() != null
1667-
&& uidIter.term().compareTo(emptyBR) != 0
1668-
&& Util.uid2url(uidIter.term().utf8ToString()).compareTo(path) <= 0) {
1726+
while (uidIter != null && uidIter.term() != null && uidIter.term().compareTo(emptyBR) != 0
1727+
&& FILEPATH_COMPARATOR.compare(
1728+
Path.of(Util.uid2url(uidIter.term().utf8ToString())),
1729+
Path.of(path)) <= 0) {
16691730

16701731
if (deletedUids.contains(uidIter.term().utf8ToString())) {
16711732
logIgnoredUid(uidIter.term().utf8ToString());
@@ -1688,9 +1749,10 @@ void processFileHistoryBased(IndexDownArgs args, File file, String path) throws
16881749
if (!matchOK) {
16891750
removeFile(false);
16901751
addWorkHistoryBased(args, termFile, termPath);
1752+
deletedUidsHere.add(removeFile(false));
16911753
}
16921754
} else {
1693-
removeFile(!fileExists);
1755+
deletedUidsHere.add(removeFile(!fileExists));
16941756
}
16951757

16961758
BytesRef next = uidIter.next();
@@ -1703,6 +1765,18 @@ void processFileHistoryBased(IndexDownArgs args, File file, String path) throws
17031765
// That said, it is necessary to check whether the file can be accepted. This is done in the function below.
17041766
// Also, allow for broken symbolic links (File.exists() returns false for these).
17051767
if (fileExists || Files.isSymbolicLink(file.toPath())) {
1768+
// This assumes that the last modified time is indeed what the indexer uses when adding the document.
1769+
String time = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND);
1770+
if (deletedUidsHere.contains(Util.path2uid(path, time))) {
1771+
//
1772+
// Adding document with the same date of a pre-existing document which is being removed
1773+
// will lead to index corruption (duplicate documents). Hence, make the indexer to fail hard.
1774+
//
1775+
throw new IndexerFault(
1776+
String.format("attempting to add file '%s' with date matching deleted document: %s",
1777+
path, time));
1778+
}
1779+
17061780
addWorkHistoryBased(args, file, path);
17071781
}
17081782
}

opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,12 @@ public static void main(String[] argv) {
287287
System.exit(2);
288288
} catch (IndexCheckException e) {
289289
System.err.printf("Index check failed%n");
290-
System.err.print("You might want to remove " + e.getFailedPaths());
290+
if (!e.getFailedPaths().isEmpty()) {
291+
System.err.print("You might want to remove " + e.getFailedPaths());
292+
} else {
293+
System.err.println("with exception: " + e);
294+
e.printStackTrace(System.err);
295+
}
291296
System.exit(1);
292297
}
293298

opengrok-indexer/src/main/java/org/opengrok/indexer/web/Util.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -823,7 +823,7 @@ private static void writeAnnotation(int num, Writer out, Annotation annotation,
823823
* walk of the file hierarchy. Thus, null character (\u0000) is used both to
824824
* separate directory components and to separate the path from the date.
825825
*
826-
* @param path path to mangle.
826+
* @param path path to mangle. This is assumed to be relative to source root.
827827
* @param date date string to use.
828828
* @return the mangled path.
829829
*/

0 commit comments

Comments
 (0)