Skip to content

Commit a3cb935

Browse files
committed
Make parsing 7z listing work on Linux
Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 36d82cb commit a3cb935

File tree

43 files changed

+4838
-1108
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+4838
-1108
lines changed

src/extractcode/sevenzip.py

Lines changed: 118 additions & 174 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,17 @@
5454
else:
5555
from pipes import quote as shlex_quote
5656

57-
5857
"""
5958
Low level support for p/7zip-based archive extraction.
6059
"""
6160

62-
6361
logger = logging.getLogger(__name__)
6462

6563
TRACE = False
6664
TRACE_DEEP = False
65+
TRACE_ENTRIES = False
6766

68-
if TRACE or TRACE_DEEP:
67+
if TRACE or TRACE_DEEP or TRACE_ENTRIES:
6968
import sys
7069
logging.basicConfig(stream=sys.stdout)
7170
logger.setLevel(logging.DEBUG)
@@ -518,45 +517,6 @@ def list_entries(location, arch_type='*'):
518517
return parse_7z_listing(stdout, utf), error_messages
519518

520519

521-
def as_entry(infos):
522-
"""
523-
Return an Entry built from a 7zip path listing data in the `infos` mapping.
524-
"""
525-
is_symlink = False
526-
is_hardlink = False
527-
link_target = None
528-
529-
sl = infos.get('Symbolic Link')
530-
531-
if sl:
532-
is_symlink = True
533-
link_target = sl
534-
535-
hl = infos.get('Hard Link')
536-
if hl:
537-
is_hardlink = True
538-
link_target = hl
539-
540-
if sl and hl:
541-
from pprint import pformat
542-
raise ExtractWarningIncorrectEntry(
543-
'A symlink cannot be also a hardlink: {}'.format(pformat(infos)))
544-
545-
is_dir = infos.get('Folder', False) == '+'
546-
547-
e = Entry(
548-
path=infos.get('Path'),
549-
size=infos.get('Size', 0),
550-
date=infos.get('Modified', None),
551-
is_dir=is_dir,
552-
is_file=not is_dir,
553-
is_symlink=is_symlink,
554-
is_hardlink=is_hardlink,
555-
link_target=link_target,
556-
)
557-
return e
558-
559-
560520
def parse_7z_listing(location, utf=False):
561521
"""
562522
Return a list of Entry objects from parsing a long format 7zip listing from a
@@ -567,16 +527,18 @@ def parse_7z_listing(location, utf=False):
567527
568528
The 7zip -slt format looks like this:
569529
530+
1. a header with:
570531
- copyright and version details
571532
- '--' line
572533
- archive header info, varying based on the archive types and subtype
573534
- lines of key=value pairs
574-
- Errors: followed by one or more message lines
575-
- Warnings: followed by one or more message lines
576-
- Open Warning: : followed by one or more message lines
577-
- sometimes a '---' line
535+
- ERRORS: followed by one or more message lines
536+
- WARNINGS: followed by one or more message lines
578537
- blank line
579-
- '----------' line
538+
539+
2. blocks of path aka. entry data, one for each path with:
540+
541+
- '----------' line once as the indicator of path blocks starting
580542
- for each archive member:
581543
- lines of either
582544
- key = value pairs, with a possible twist that the Path may
@@ -585,162 +547,94 @@ def parse_7z_listing(location, utf=False):
585547
- Warnings: followed by one or more message lines
586548
- Open Warning: : followed by one or more message lines
587549
- blank line
588-
- two blank lines
550+
551+
3. a footer
552+
- blank line
589553
- footer sometimes with lines with summary stats
590554
such as Warnings: 1 Errors: 1
591555
- a line with two or more dashes or an empty line
556+
557+
We ignore the header and footer in a listing.
592558
"""
593559

594560
if utf or py3:
595561
# read to unicode
596562
with io.open(location, 'r', encoding='utf-8') as listing:
597563
text = listing.read()
598-
if TRACE_DEEP:
599-
print('=====================================================')
600-
print(text)
601-
print('=====================================================')
602-
603564
text = text.replace(u'\r\n', u'\n')
604565

605-
header_sep = u'\n----------\n'
606-
empty = u''
607-
body_sep = u'\n\n\n'
608-
path_block_sep = u'Path ='
609-
msg_sep = u':'
610-
equal_sep = u'='
611-
errror_line_starters = 'Open Warning:', 'Errors:', 'Warnings:'
612-
line_sep = u'\n'
566+
end_of_header = u'----------\n'
567+
path_key = u'Path'
568+
kv_sep = u'='
569+
path_blocks_sep = u'\n\n'
570+
line_sep = u'\n'
613571

614572
else:
615573
# read to bytes
616574
with io.open(location, 'rb') as listing:
617575
text = listing.read()
618576
text = text.replace(b'\r\n', b'\n')
619577

620-
header_sep = b'\n----------\n'
621-
empty = b''
622-
body_sep = b'\n\n\n'
623-
path_block_sep = b'Path ='
624-
msg_sep = b':'
625-
equal_sep = b'='
626-
errror_line_starters = b'Open Warning:', b'Errors:', b'Warnings:'
627-
line_sep = b'\n'
578+
end_of_header = b'----------\n'
579+
path_key = b'Path'
580+
kv_sep = b'='
581+
path_blocks_sep = b'\n\n'
582+
line_sep = b'\n'
628583

629584
if TRACE:
630585
logger.debug('parse_7z_listing: initial text: type: ' + repr(type(text)))
631586
print('--------------------------------------')
632587
print(text)
633588
print('--------------------------------------')
634589

635-
header_tail = re.split(header_sep, text, flags=re.MULTILINE) # NOQA
636-
if len(header_tail) != 2:
637-
# we have more than one header, confusion ensues.
638-
raise ExtractWarningIncorrectEntry(
639-
'Incorrect 7zip listing with multiple headers: {}'.format(repr(header_tail)))
590+
# for now we ignore the header
591+
_header, _, paths = text.rpartition(end_of_header)
640592

641-
if len(header_tail) == 1:
593+
if not paths:
642594
# we have only a header, likely an error condition or an empty archive
643595
return []
644596

645-
# FIXME: do something with header and footer?
646-
_header, body = header_tail
647-
body_and_footer = re.split(body_sep, body, flags=re.MULTILINE) # NOQA
648-
no_footer = len(body_and_footer) == 1
649-
multiple_footers = len(body_and_footer) > 2
650-
_footer = empty
651-
652-
if no_footer:
653-
body = body_and_footer[0]
654-
elif multiple_footers:
655-
raise ExtractWarningIncorrectEntry(
656-
'Incorrect 7zip listing with multiple footers: {}'.format(repr(body_and_footer)))
657-
else:
658-
body, _footer == body_and_footer
659-
660-
entries = []
597+
# each block representing one path or file:
598+
# - starts with a "Path = <some/path>" key/value
599+
# - continues with key = value pairs each on a single line
600+
# (unless there is a \n in file name which is an error condition)
601+
# - ends with an empty line
602+
# then we have a global footer
661603

662-
if TRACE:
663-
logger.debug('parse_7z_listing: body:')
664-
print(body)
604+
path_blocks = [pb for pb in paths.split(path_blocks_sep) if pb and path_key in pb]
665605

666-
path_blocks = [pb.strip() for pb in
667-
re.split(path_block_sep, body, flags=re.MULTILINE) if pb and pb.strip()] # NOQA
668-
669-
if TRACE_DEEP:
670-
logger.debug('parse_7z_listing: path_blocks:')
671-
pprint.pprint(path_blocks)
606+
entries = []
672607

673608
for path_block in path_blocks:
674-
if TRACE:
675-
logger.debug('parse_7z_listing: path_block: {}'.format(path_block))
676-
677-
errors = []
678-
infos = {}
679-
680-
lines = path_block.splitlines(False)
681-
682-
if len(lines) == 1:
683-
# a temp macOS debug statement
684-
raise Exception(text)
685-
686-
# the first line is the Path line
687-
path_line = lines.pop(0).strip()
688-
if 'Path =' in path_line:
689-
_, _, path = path_line.partition('Path =')
690-
path = path.lstrip()
691-
else:
692-
path = path_line
693-
694-
second = lines[0]
695-
696-
if equal_sep not in second:
697-
# the path contains line breaks and the next line continues the name
698-
path = line_sep.join([path, second])
699-
lines.pop(0)
700-
701-
infos['Path'] = path
702-
703-
is_err = False
704-
705-
# process the remaining non-path lines
706-
for line in lines:
707-
if TRACE_DEEP:
708-
logger.debug('parse_7z_listing: line: "{}"'.format(line))
709-
710-
line = line.strip()
711-
712-
if not line:
713-
continue
714-
715-
if line.startswith(errror_line_starters):
716-
is_err = True
717-
messages = line.split(msg_sep, 1)
718-
errors.append(messages)
719-
continue
720-
721-
if equal_sep not in line and is_err:
722-
# not a key = value line, an error message
723-
errors.append(line)
724-
continue
725-
726-
parts = line.split(equal_sep, 1)
727-
728-
if len(parts) != 2:
729-
raise ExtractWarningIncorrectEntry(
730-
'Incorrect 7zip listing line with no key=value: {}'.format(repr(line)))
731-
732-
is_err = False
733-
key, value = parts
734-
key = key.strip()
735-
value = value.strip()
736-
assert key not in infos, 'Duplicate keys in 7zip listing'
737-
infos[key] = value or empty
738-
739-
if infos:
740-
entr = as_entry(infos)
741-
entries.append(entr)
742-
743-
if TRACE_DEEP:
609+
# we ignore empty lines as well as lines that do not contain a key
610+
lines = [line.strip() for line in path_block.splitlines(False) if line.strip()]
611+
if not lines:
612+
continue
613+
# we have a weird case of path with line returns in the file name
614+
# we concatenate these in the first Path line
615+
while len(lines) > 1 and lines[0].startswith(path_key) and kv_sep not in lines[1]:
616+
first_line = lines[0]
617+
second_line = lines.pop(1)
618+
first_line = line_sep.join([first_line, second_line])
619+
lines[0] = first_line
620+
621+
dangling_lines = [line for line in lines if kv_sep not in line]
622+
entry_errors = []
623+
if dangling_lines:
624+
emsg = 'Invalid 7z listing path block missing "=" as key/value separator: {}'.format(repr(path_block))
625+
entry_errors.append(emsg)
626+
627+
entry_attributes = {}
628+
key_lines = [line for line in lines if kv_sep in line]
629+
for line in key_lines:
630+
k, _, v = line.partition(kv_sep)
631+
k = k.strip()
632+
v = v.strip()
633+
entry_attributes[k] = v
634+
635+
entries.append(Entry.from_dict(infos=entry_attributes, errors=entry_errors))
636+
637+
if TRACE_ENTRIES:
744638
logger.debug('parse_7z_listing: entries# {}\n'.format(len(entries)))
745639
for entry in entries:
746640
logger.debug(' ' + repr(entry.to_dict()))
@@ -777,12 +671,9 @@ class Entry(object):
777671
link_target = attr.ib(default=None)
778672
errors = attr.ib(default=attr.Factory(list))
779673

780-
def parent(self):
781-
return posixpath.dirname(self.path.rstrip('/'))
782-
783674
def to_dict(self, full=False):
784675
data = attr.asdict(self)
785-
data.pop('errors', None)
676+
#data.pop('errors', None)
786677
if not full:
787678
data.pop('date', None)
788679
return data
@@ -795,3 +686,56 @@ def is_relative_path(self):
795686

796687
def is_empty(self):
797688
return not self.size
689+
690+
@classmethod
691+
def from_dict(cls, infos, errors=None):
692+
"""
693+
Return an Entry built from a 7zip path listing data in the `infos` mapping.
694+
"""
695+
is_symlink = False
696+
is_hardlink = False
697+
link_target = None
698+
699+
sl = infos.get('Symbolic Link')
700+
701+
if sl:
702+
is_symlink = True
703+
link_target = sl
704+
705+
hl = infos.get('Hard Link')
706+
if hl:
707+
is_hardlink = True
708+
link_target = hl
709+
710+
if sl and hl:
711+
from pprint import pformat
712+
raise ExtractWarningIncorrectEntry(
713+
'A symlink cannot be also a hardlink: {}'.format(pformat(infos)))
714+
715+
# depending on the type of archive the file vs dir flags are in
716+
# different attributes :|
717+
is_dir = (
718+
# in some listings we have this: Mode = drwxrwxr-x
719+
infos.get('Mode', '').lower().startswith('d')
720+
or
721+
# in cpio and a few more we have a Folder attrib
722+
infos.get('Folder', '').startswith('+')
723+
or
724+
# in 7z listing we have this: Attributes = D_ drwxrwxr-x
725+
infos.get('Attributes', '').lower().startswith('d_')
726+
) or False
727+
728+
is_file = not is_dir
729+
730+
e = cls(
731+
path=infos.get('Path'),
732+
size=infos.get('Size', 0),
733+
date=infos.get('Modified', None),
734+
is_dir=is_dir,
735+
is_file=is_file,
736+
is_symlink=is_symlink,
737+
is_hardlink=is_hardlink,
738+
link_target=link_target,
739+
errors=errors or [],
740+
)
741+
return e

0 commit comments

Comments
 (0)