From e0b4c6e8ce936077079369ed472359b4818e6e6e Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Wed, 9 Apr 2025 16:42:08 +0530 Subject: [PATCH 1/4] split token on colon and remove leading plus sign Reference: https://github.com/aboutcode-org/scancode-toolkit/issues/4229 Signed-off-by: Alok Kumar --- src/cluecode/copyrights.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 190ff2082a..697402e8ea 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -437,6 +437,23 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split): .strip() ) + # remove leading plus sign + if tok.startswith('+'): + tok = tok.lstrip('+') + # convert 'AUTHOR' to ('author' or 'Author') + if tok == 'AUTHOR': + tok = 'author' + + # Split tokens like 'Author:Frankie.Chu' into 'Author' and 'Frankie.Chu' + if tok.startswith("Author:"): + parts = tok.split(":", 1) + for part in parts: + part = part.strip() + if part and part not in ':.': + yield Token(value=part, start_line=start_line, pos=pos) + pos += 1 + continue + # the tokenizer allows a single colon or dot to be a token and we discard these if tok and tok not in ':.': yield Token(value=tok, start_line=start_line, pos=pos) From 7401e720270bc3a3a339efcbc9c104f60852e920 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Sun, 13 Apr 2025 03:14:25 +0530 Subject: [PATCH 2/4] fix test failure no need to remove single plus sign Signed-off-by: Alok Kumar --- src/cluecode/copyrights.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 697402e8ea..31981f9a76 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -438,7 +438,7 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split): ) # remove leading plus sign - if tok.startswith('+'): + if tok.startswith('+') and len(tok) > 1: tok = tok.lstrip('+') # convert 'AUTHOR' to 
('author' or 'Author') if tok == 'AUTHOR': From 75821d2d6ba6911dc616f270b1101cf166bb9ddb Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Sun, 13 Apr 2025 22:15:29 +0530 Subject: [PATCH 3/4] add support for single token If a single word starts with a capital letter and contains a dot (.) between its parts (like Frankie.Chu), tag it as NNP. Signed-off-by: Alok Kumar --- src/cluecode/copyrights.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 31981f9a76..02ed1d65e2 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -1810,6 +1810,9 @@ def build_detection_from_node( r'[A-Z][a-z]+[\.,]?' r')$', 'NNP'), + # Matches a capitalized word with a dot (Like Frankie.Chu). + (r'^[A-Z][a-zA-Z]*\.[a-zA-Z]+$', 'NNP'), + # cmmunications (r'communications', 'NNP'), @@ -3495,6 +3498,9 @@ def build_detection_from_node( # developed by Atkinson, et al. AUTHOR: { + } #Atkinson, et al. + # Author:Frankie.Chu + AUTHOR: { } + ####################################### # Mixed AUTHOR and COPYRIGHT From e104affdd8c01770dbfcd0b65825a10d061e29c6 Mon Sep 17 00:00:00 2001 From: Alok Kumar Date: Sun, 13 Apr 2025 22:46:47 +0530 Subject: [PATCH 4/4] fix test failure Signed-off-by: Alok Kumar --- src/cluecode/copyrights.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 02ed1d65e2..ebaf283dfa 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -1810,9 +1810,6 @@ def build_detection_from_node( r'[A-Z][a-z]+[\.,]?' r')$', 'NNP'), - # Matches a capitalized word with a dot (Like Frankie.Chu). - (r'^[A-Z][a-zA-Z]*\.[a-zA-Z]+$', 'NNP'), - # cmmunications (r'communications', 'NNP'), @@ -3498,10 +3495,6 @@ def build_detection_from_node( # developed by Atkinson, et al. AUTHOR: { + } #Atkinson, et al. 
- # Author:Frankie.Chu - AUTHOR: { } - - ####################################### # Mixed AUTHOR and COPYRIGHT #######################################