@@ -298,14 +298,13 @@ def detect(self,
298
298
non_holder_labels_mini = frozenset ([
299
299
'COPY' ,
300
300
'YR-RANGE' , 'YR-AND' , 'YR' , 'YR-PLUS' , 'BARE-YR' ,
301
- 'HOLDER' , 'AUTHOR' ,
302
301
'IS' , 'HELD' ,
303
302
])
304
303
305
304
non_authors_labels = frozenset ([
306
305
'COPY' ,
307
306
'YR-RANGE' , 'YR-AND' , 'YR' , 'YR-PLUS' , 'BARE-YR' ,
308
- 'HOLDER' , 'AUTHOR' ,
307
+ 'AUTH' , 'AUTH2' , 'HOLDER' ,
309
308
'IS' , 'HELD' ,
310
309
])
311
310
@@ -322,10 +321,9 @@ def detect(self,
322
321
copyrght = build_detection_from_node (
323
322
node = tree_node ,
324
323
cls = CopyrightDetection ,
325
- ignores = non_copyright_labels ,
324
+ ignored_labels = non_copyright_labels ,
326
325
include_copyright_allrights = include_copyright_allrights ,
327
326
refiner = refine_copyright ,
328
- junk = COPYRIGHTS_JUNK ,
329
327
)
330
328
331
329
if TRACE or TRACE_DEEP :
@@ -340,7 +338,7 @@ def detect(self,
340
338
holder = build_detection_from_node (
341
339
node = tree_node ,
342
340
cls = HolderDetection ,
343
- ignores = non_holder_labels ,
341
+ ignored_labels = non_holder_labels ,
344
342
refiner = refine_holder ,
345
343
)
346
344
@@ -351,7 +349,7 @@ def detect(self,
351
349
holder = build_detection_from_node (
352
350
node = tree_node ,
353
351
cls = HolderDetection ,
354
- ignores = non_holder_labels_mini ,
352
+ ignored_labels = non_holder_labels_mini ,
355
353
refiner = refine_holder ,
356
354
)
357
355
@@ -365,9 +363,8 @@ def detect(self,
365
363
author = build_detection_from_node (
366
364
node = tree_node ,
367
365
cls = AuthorDetection ,
368
- ignores = non_authors_labels ,
366
+ ignored_labels = non_authors_labels ,
369
367
refiner = refine_author ,
370
- junk = AUTHORS_JUNK ,
371
368
)
372
369
373
370
if author :
@@ -385,15 +382,21 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
385
382
We perform a simple tokenization on spaces, tabs and some punctuation: =;
386
383
"""
387
384
for start_line , line in numbered_lines :
385
+ pos = 0
386
+
388
387
if TRACE_TOK :
389
388
logger_debug (' get_tokens: bare line: ' + repr (line ))
390
389
390
+ # if not line.strip():
391
+ # yield Token(value="\n", label="EMPTY_LINE", start_line=start_line, pos=pos)
392
+ # pos += 1
393
+ # continue
394
+
391
395
line = prepare_text_line (line )
392
396
393
397
if TRACE_TOK :
394
398
logger_debug (' get_tokens: preped line: ' + repr (line ))
395
399
396
- pos = 0
397
400
for tok in splitter (line ):
398
401
# strip trailing quotes+comma
399
402
if tok .endswith ("'," ):
@@ -406,7 +409,7 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
406
409
.strip ()
407
410
)
408
411
409
- # the tokenizer allows a sinble colon or dot to be atoken and we discard these
412
+ # the tokenizer allows a single colon or dot to be a token and we discard these
410
413
if tok and tok not in ':.' :
411
414
yield Token (value = tok , start_line = start_line , pos = pos )
412
415
pos += 1
@@ -475,42 +478,45 @@ class AuthorDetection(Detection):
475
478
end_line = attr .ib ()
476
479
477
480
481
+ def filter_tokens (node , ignored_labels = frozenset ()):
482
+ """
483
+ Yield tokens for this parse Tree, ignoring nodes with a label in the ``ignored_labels`` set.
484
+ The order reflects the order of the leaves in the tree's hierarchical structure, depth-first.
485
+ """
486
+ for token in node :
487
+ if token .label in ignored_labels :
488
+ continue
489
+ if isinstance (token , Tree ):
490
+ yield from filter_tokens (token , ignored_labels = ignored_labels )
491
+ else :
492
+ yield token
493
+
494
+
478
495
def build_detection_from_node (
479
496
node ,
480
497
cls ,
481
- ignores = frozenset (),
498
+ ignored_labels = frozenset (),
482
499
include_copyright_allrights = False ,
483
500
refiner = None ,
484
- junk = frozenset (),
485
- junk_patterns = frozenset (),
486
501
):
487
502
"""
488
503
Return a ``cls`` Detection object from a pygmars.tree.Tree ``node`` with a
489
504
space-normalized string value or None.
490
505
491
- Filter ``node`` Tokens with a type found in the ``ignores `` set of ignorable
506
+ Filter ``node`` Tokens with a type found in the ``ignored_labels `` set of ignorable
492
507
token types.
493
508
494
509
For copyright detection, include trailing "All rights reserved" if
495
510
``include_copyright_allrights`` is True.
496
511
497
512
Apply the ``refiner`` callable function to the detection string.
498
-
499
- Return None if the value exists in the ``junk`` strings set or is matched by
500
- any of the regex in the ``junk_patterns`` set.
501
513
"""
502
514
include_copyright_allrights = (
503
515
cls == CopyrightDetection
504
516
and include_copyright_allrights
505
517
)
506
518
507
- if ignores :
508
- leaves = [
509
- token for token in node .leaves ()
510
- if token .label not in ignores
511
- ]
512
- else :
513
- leaves = node .leaves ()
519
+ leaves = list (filter_tokens (node , ignored_labels = ignored_labels ))
514
520
515
521
if include_copyright_allrights :
516
522
filtered = leaves
@@ -545,7 +551,7 @@ def build_detection_from_node(
545
551
if refiner :
546
552
node_string = refiner (node_string )
547
553
548
- if node_string and not is_junk_copyryright (node_string ):
554
+ if node_string and not is_junk_copyright (node_string ):
549
555
start_line = filtered [0 ].start_line
550
556
end_line = filtered [- 1 ].start_line
551
557
@@ -1370,6 +1376,8 @@ def build_detection_from_node(
1370
1376
(r'^Bugfixes?$' , 'NN' ),
1371
1377
(r'^Likes?$' , 'NN' ),
1372
1378
(r'^STA$' , 'NN' ),
1379
+ (r'^Page$' , 'NN' ),
1380
+ (r'^Todo/Under$' , 'JUNK' ),
1373
1381
1374
1382
(r'^Interrupt$' , 'NN' ),
1375
1383
(r'^cleanups?$' , 'JUNK' ),
@@ -2071,7 +2079,7 @@ def build_detection_from_node(
2071
2079
(r'^\$?date-of-document$' , 'YR' ),
2072
2080
2073
2081
# cardinal numbers
2074
- (r'^-?[0-9]+(.[0-9]+)?\. ?$' , 'CD' ),
2082
+ (r'^-?[0-9]+(.[0-9]+)?[\.,] ?$' , 'CD' ),
2075
2083
2076
2084
############################################################################
2077
2085
# All caps and proper nouns
@@ -2239,6 +2247,8 @@ def build_detection_from_node(
2239
2247
YR-RANGE: {<YR-AND>+} #70
2240
2248
YR-RANGE: {<YR-RANGE>+ <DASH|TO> <YR-RANGE>+} #71
2241
2249
YR-RANGE: {<YR-RANGE>+ <DASH>?} #72
2250
+ # Copyright (c) 1999, 2000, 01, 03, 06 Ralf Baechle
2251
+ YR-RANGE: {<YR-RANGE> <CD>+} #72.2
2242
2252
2243
2253
CD: {<BARE-YR>} #bareyear
2244
2254
@@ -3178,7 +3188,7 @@ def build_detection_from_node(
3178
3188
# the Initial Developer. All Rights Reserved.
3179
3189
COPYRIGHT: {<PORTIONS> <AUTH2> <INITIALDEV> <IS> <COPY|COPYRIGHT2>+ <YR-RANGE>? <INITIALDEV>} #2609.1
3180
3190
3181
- # Portions created by the Initial Developer are Copyright (C)
3191
+ # Portions created by the Initial Developer are Copyright (C)
3182
3192
# the Initial Developer. All Rights Reserved.
3183
3193
# and
3184
3194
# Portions created by the Initial Developer are Copyright (C) 2002
@@ -3576,7 +3586,7 @@ def refine_names(s, prefixes):
3576
3586
COPYRIGHTS_JUNK_PATTERN_MATCHERS = [re .compile (p , re .IGNORECASE ).match for p in COPYRIGHTS_JUNK ]
3577
3587
3578
3588
3579
- def is_junk_copyryright (s , patterns = COPYRIGHTS_JUNK_PATTERN_MATCHERS ):
3589
+ def is_junk_copyright (s , patterns = COPYRIGHTS_JUNK_PATTERN_MATCHERS ):
3580
3590
"""
3581
3591
Return True if the string ``s`` matches any junk patterns.
3582
3592
"""
0 commit comments