[clang][scandeps] Improve handling of rawstrings. (llvm#139504)

tru · rlavaee · commit 8512c6447585 · 2025-07-01T15:44:54.000Z
diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
@@ -585,6 +585,11 @@ class Lexer : public PreprocessorLexer {
   /// sequence.
   static bool isNewLineEscaped(const char *BufferStart, const char *Str);
 
+  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
+  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
+  /// to this function.
+  static unsigned getEscapedNewLineSize(const char *P);
+
   /// Diagnose use of a delimited or named escape sequence.
   static void DiagnoseDelimitedOrNamedEscapeSequence(SourceLocation Loc,
                                                      bool Named,
@@ -725,11 +730,6 @@ class Lexer : public PreprocessorLexer {
   /// method.
   SizedChar getCharAndSizeSlow(const char *Ptr, Token *Tok = nullptr);
 
-  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
-  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
-  /// to this function.
-  static unsigned getEscapedNewLineSize(const char *P);
-
   /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
   /// them), skip over them and return the first non-escaped-newline found,
   /// otherwise return P.
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -206,6 +206,30 @@ static void skipOverSpaces(const char *&First, const char *const End) {
     ++First;
 }
 
+// Move Current back by one character, skipping escaped newlines (a backslash
+// directly followed by a newline). Never moves before First: if the escape
+// begins at First, Current stops on that backslash. Returns *Current.
+// NOTE(review): "\\\r\n" is not skipped; the '\n' is preceded by '\r'.
+static char previousChar(const char *First, const char *&Current) {
+  assert(Current > First);
+  --Current;
+  while (Current > First && isVerticalWhitespace(*Current)) {
+    // The newline is escaped only if a backslash directly precedes it.
+    if (*(Current - 1) != '\\')
+      break;
+    unsigned EscapeSize = Lexer::getEscapedNewLineSize(Current);
+    if (EscapeSize == 0)
+      break;
+    // Clamp at First rather than stepping off the front of the buffer.
+    if (Current - First <= static_cast<long>(EscapeSize)) {
+      Current = First;
+      break;
+    }
+    Current -= (1 + EscapeSize);
+  }
+  return *Current;
+}
+
 [[nodiscard]] static bool isRawStringLiteral(const char *First,
                                              const char *Current) {
   assert(First <= Current);
@@ -215,25 +239,27 @@ static void skipOverSpaces(const char *&First, const char *const End) {
     return false;
 
   // Check for an "R".
-  --Current;
-  if (*Current != 'R')
+  if (previousChar(First, Current) != 'R')
     return false;
-  if (First == Current || !isAsciiIdentifierContinue(*--Current))
+  if (First == Current ||
+      !isAsciiIdentifierContinue(previousChar(First, Current)))
     return true;
 
   // Check for a prefix of "u", "U", or "L".
   if (*Current == 'u' || *Current == 'U' || *Current == 'L')
-    return First == Current || !isAsciiIdentifierContinue(*--Current);
+    return First == Current ||
+           !isAsciiIdentifierContinue(previousChar(First, Current));
 
   // Check for a prefix of "u8".
-  if (*Current != '8' || First == Current || *Current-- != 'u')
+  if (*Current != '8' || First == Current ||
+      previousChar(First, Current) != 'u')
     return false;
-  return First == Current || !isAsciiIdentifierContinue(*--Current);
+  return First == Current ||
+         !isAsciiIdentifierContinue(previousChar(First, Current));
 }
 
 static void skipRawString(const char *&First, const char *const End) {
   assert(First[0] == '"');
-  assert(First[-1] == 'R');
 
   const char *Last = ++First;
   while (Last != End && *Last != '(')
@@ -416,6 +442,14 @@ void Scanner::skipLine(const char *&First, const char *const End) {
         continue;
       }
 
+      // Continue on the same line if an EOL is preceded with backslash
+      if (First + 1 < End && *First == '\\') {
+        if (unsigned Len = isEOL(First + 1, End)) {
+          First += 1 + Len;
+          continue;
+        }
+      }
+
       // Iterate over comments correctly.
       if (*First != '/' || End - First < 2) {
         LastTokenPtr = First;
diff --git a/clang/test/ClangScanDeps/raw-strings.cpp b/clang/test/ClangScanDeps/raw-strings.cpp
@@ -0,0 +1,148 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.in > %t/cdb.json
+
+//--- cdb.json.in
+[{
+    "directory": "DIR",
+    "command": "clang -c DIR/tu.c -o DIR/tu.o -IDIR/include",
+    "file": "DIR/tu.c"
+}]
+//--- include/header.h
+//--- include/header2.h
+//--- include/header3.h
+//--- include/header4.h
+//--- include/header5.h
+//--- include/header6.h
+//--- include/header7.h
+//--- include/header8.h
+//--- include/header9.h
+//--- include/header10.h
+//--- include/header11.h
+//--- include/header12.h
+//--- include/header13.h
+//--- include/header14.h
+//--- tu.c
+#if 0
+R"x()x"
+#endif
+
+#include "header.h"
+
+#if 0
+R"y(";
+#endif
+#include "header2.h"
+
+#if 0
+//")y"
+#endif
+
+#if 0
+R"y(";
+R"z()y";
+#endif
+#include "header3.h"
+#if 0
+//")z"
+#endif
+
+#if 0
+R\
+"y(";
+R"z()y";
+#endif
+#include "header4.h"
+#if 0
+//")z"
+#endif
+
+// Test u8 prefix with escaped newline
+#if 0
+u8R\
+"prefix(test)prefix"
+#endif
+#include "header5.h"
+
+// Test u prefix with multiple escaped newlines
+#if 0
+uR\
+\
+"multi(test)multi"
+#endif
+#include "header6.h"
+
+// Test U prefix with escaped newline
+#if 0
+UR\
+"upper(test)upper"
+#endif
+#include "header7.h"
+
+// Test L prefix with escaped newline
+#if 0
+LR\
+"wide(test)wide"
+#endif
+#include "header8.h"
+
+// Test escaped newline with \r\n style
+#if 0
+R\
+"crlf(test)crlf"
+#endif
+#include "header9.h"
+
+// Test multiple escaped newlines in different positions
+#if 0
+u\
+8\
+R\
+"complex(test)complex"
+#endif
+#include "header10.h"
+
+// Test that an R preceded by identifier characters is NOT treated as a raw string prefix
+#if 0
+identifierR"notraw(test)notraw"
+#endif
+#include "header11.h"
+
+// Test R followed by whitespace and an escaped newline (not a raw string prefix)
+#if 0
+R \
+"whitespace(test)whitespace"
+#endif
+#include "header12.h"
+
+// Test nested raw strings in disabled code
+#if 0
+R"outer(
+    R"inner(content)inner"
+)outer"
+#endif
+#include "header13.h"
+
+// Test raw string with empty delimiter
+#if 0
+R\
+"(empty delimiter)";
+#endif
+#include "header14.h"
+
+// RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess | FileCheck %s
+// RUN: clang-scan-deps -compilation-database %t/cdb.json -mode preprocess-dependency-directives | FileCheck %s
+// CHECK: tu.c
+// CHECK-NEXT: header.h
+// CHECK-NEXT: header3.h
+// CHECK-NEXT: header4.h
+// CHECK-NEXT: header5.h
+// CHECK-NEXT: header6.h
+// CHECK-NEXT: header7.h
+// CHECK-NEXT: header8.h
+// CHECK-NEXT: header9.h
+// CHECK-NEXT: header10.h
+// CHECK-NEXT: header11.h
+// CHECK-NEXT: header12.h
+// CHECK-NEXT: header13.h
+// CHECK-NEXT: header14.h