Skip to content

Commit 056b6c1

Browse files
committed
Add new CLI option to support extracting all formats
With this, all supported archive formats will be tried. Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 371c11e commit 056b6c1

File tree

4 files changed

+131
-28
lines changed

4 files changed

+131
-28
lines changed

src/extractcode/api.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,22 +23,37 @@
2323
"""
2424

2525

26-
def extract_archives(location, recurse=True, replace_originals=False, ignore_pattern=()):
26+
def extract_archives(
27+
location,
28+
recurse=True,
29+
replace_originals=False,
30+
ignore_pattern=(),
31+
all_formats=False,
32+
):
2733
"""
2834
Yield ExtractEvent while extracting archive(s) and compressed files at
29-
`location`. If `recurse` is True, extract nested archives-in-archives
30-
recursively.
35+
`location`.
36+
37+
If `recurse` is True, extract nested archives-in-archives recursively.
38+
If `all_formats` is True, extract all supported archive formats.
39+
3140
Archives and compressed files are extracted in a directory named
3241
"<file_name>-extract" created in the same directory as the archive.
42+
3343
Note: this API is returning an iterable and NOT a sequence.
3444
"""
45+
3546
from extractcode.extract import extract
3647
from extractcode import default_kinds
48+
from extractcode import all_kinds
49+
50+
kinds = all_kinds if all_formats else default_kinds
51+
3752
for xevent in extract(
3853
location=location,
39-
kinds=default_kinds,
54+
kinds=kinds,
4055
recurse=recurse,
4156
replace_originals=replace_originals,
42-
ignore_pattern=ignore_pattern
57+
ignore_pattern=ignore_pattern,
4358
):
4459
yield xevent

src/extractcode/archive.py

Lines changed: 65 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,10 @@
3838
from extractcode import patches
3939
from extractcode import special_package
4040

41+
from extractcode import libarchive2
4142
from extractcode import patch
4243
from extractcode import sevenzip
43-
from extractcode import libarchive2
44+
4445
from extractcode.uncompress import uncompress_gzip
4546
from extractcode.uncompress import uncompress_bzip2
4647

@@ -79,8 +80,19 @@
7980
- http://en.wikipedia.org/wiki/List_of_file_formats#Archive_and_compressed
8081
"""
8182

82-
# if strict, all hanlders criteria must be matched for it to be selected
83-
Handler = namedtuple('Handler', ['name', 'filetypes', 'mimetypes', 'extensions', 'kind', 'extractors', 'strict'])
83+
# if strict, all handlers criteria must be matched for a handler to be selected
84+
Handler = namedtuple(
85+
'Handler',
86+
[
87+
'name',
88+
'filetypes',
89+
'mimetypes',
90+
'extensions',
91+
'kind',
92+
'extractors',
93+
'strict',
94+
]
95+
)
8496

8597

8698
def can_extract(location):
@@ -96,13 +108,17 @@ def can_extract(location):
96108

97109
def should_extract(location, kinds, ignore_pattern=()):
98110
"""
99-
Return True if this location should be extracted based on the provided
100-
kinds
111+
Return True if this location should be extracted based on the provided kinds
101112
"""
102113
location = os.path.abspath(os.path.expanduser(location))
103114
ignore_pattern = {extension : 'User ignore: Supplied by --ignore' for extension in ignore_pattern}
104115
should_ignore = is_ignored(location, ignore_pattern)
105-
if get_extractor(location, kinds) and not should_ignore:
116+
extractor = get_extractor(location, kinds=kinds)
117+
118+
if TRACE_DEEP:
119+
logger.debug(f' should_extract: extractor: {extractor}, should_ignore: {should_ignore}')
120+
121+
if extractor and not should_ignore:
106122
return True
107123

108124

@@ -113,15 +129,19 @@ def get_extractor(location, kinds=all_kinds):
113129
"""
114130
assert location
115131
location = os.path.abspath(os.path.expanduser(location))
116-
extractors = get_extractors(location, kinds)
132+
extractors = get_extractors(location, kinds=kinds)
117133
if not extractors:
134+
if TRACE_DEEP:
135+
logger.debug(f' get_extractor: not extractors: {extractors}')
118136
return None
119137

120138
if len(extractors) == 2:
121139
extractor1, extractor2 = extractors
122-
nested_extractor = functional.partial(extract_twice,
123-
extractor1=extractor1,
124-
extractor2=extractor2)
140+
nested_extractor = functional.partial(
141+
extract_twice,
142+
extractor1=extractor1,
143+
extractor2=extractor2,
144+
)
125145
return nested_extractor
126146
elif len(extractors) == 1:
127147
return extractors[0]
@@ -135,23 +155,38 @@ def get_extractors(location, kinds=all_kinds):
135155
location or an empty list.
136156
"""
137157
handler = get_best_handler(location, kinds)
158+
if TRACE_DEEP:
159+
logger.debug(f' get_extractors: handler: {handler}')
160+
138161
return handler and handler.extractors or []
139162

140163

141164
def get_best_handler(location, kinds=all_kinds):
142165
"""
143-
Return the best handler of None for the file at location.
166+
Return the best handler for the file at `location` or None.
144167
"""
145168
location = os.path.abspath(os.path.expanduser(location))
146169
if not filetype.is_file(location):
147170
return
171+
148172
handlers = list(get_handlers(location))
149173
if TRACE_DEEP:
150-
logger.debug('get_best_handler: handlers: %(handlers)r ' % locals())
174+
logger.debug(f' get_best_handler: handlers: {handlers}')
175+
if not handlers:
176+
return
177+
178+
candidates = list(score_handlers(handlers))
179+
if TRACE_DEEP:
180+
logger.debug(f' get_best_handler: candidates: {candidates}')
181+
if not candidates:
182+
if TRACE_DEEP:
183+
logger.debug(f' get_best_handler: candidates: {candidates}')
184+
return
151185

152-
if handlers:
153-
candidates = score_handlers(handlers)
154-
return candidates and pick_best_handler(candidates, kinds)
186+
picked = pick_best_handler(candidates, kinds=kinds)
187+
if TRACE_DEEP:
188+
logger.debug(f' get_best_handler: picked: {picked}')
189+
return picked
155190

156191

157192
def get_handlers(location):
@@ -177,6 +212,8 @@ def get_handlers(location):
177212

178213
# default to False
179214
type_matched = handler.filetypes and any(t in ftype for t in handler.filetypes)
215+
if TRACE_DEEP:
216+
logger.debug(f' get_handlers: handler.filetypes={handler.filetypes}')
180217
mime_matched = handler.mimetypes and any(m in mtype for m in handler.mimetypes)
181218
exts = handler.extensions
182219
if exts:
@@ -201,10 +238,18 @@ def score_handlers(handlers):
201238
Score candidate handlers. Higher score is better.
202239
"""
203240
for handler, type_matched, mime_matched, extension_matched in handlers:
241+
if TRACE_DEEP:
242+
logger.debug(
243+
f' score_handlers: handler={handler}, '
244+
f'type_matched={type_matched}, '
245+
f'mime_matched={mime_matched}, '
246+
f'extension_matched={extension_matched}'
247+
)
204248
score = 0
205249
# increment kind value: higher kinds numerical values are more
206250
# specific by design
207251
score += handler.kind
252+
if TRACE_DEEP: logger.debug(f' score_handlers: score += handler.kind {score}')
208253

209254
# increment score based on matched criteria
210255
if type_matched and mime_matched and extension_matched:
@@ -255,6 +300,10 @@ def pick_best_handler(candidates, kinds):
255300
"""
256301
# sort by decreasing scores: the highest-scored candidate comes first
257302
scored = sorted(candidates, reverse=True)
303+
304+
if TRACE_DEEP:
305+
logger.debug(f' pick_best_handler: scored: {scored}')
306+
258307
if not scored:
259308
return
260309

@@ -994,7 +1043,7 @@ def try_to_extract(location, target_dir, extractor):
9941043
strict=False
9951044
)
9961045

997-
PatchHandler = Handler(
1046+
PatchHandler = Handler(
9981047
name='Patch',
9991048
filetypes=('diff', 'patch',),
10001049
mimetypes=('text/x-diff',),

src/extractcode/cli.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,14 +103,16 @@ class ExtractCommand(cliutils.BaseCommand):
103103
@click.option('--shallow', is_flag=True, default=False, help='Do not extract recursively nested archives (e.g. not archives in archives).')
104104
@click.option('--replace-originals', is_flag=True, default=False, help='Replace extracted archives by the extracted content.')
105105
@click.option('--ignore', default=[], multiple=True, help='Ignore files/directories following a glob-pattern.')
106+
@click.option('--all-formats', is_flag=True, default=False, help='Extract archives from all known formats.')
106107

107108
@click.help_option('-h', '--help')
108109
@click.option('--about', is_flag=True, is_eager=True, callback=print_about, help='Show information about ExtractCode and licensing and exit.')
109110
@click.option('--version', is_flag=True, is_eager=True, callback=print_version, help='Show the version and exit.')
110-
def extractcode(ctx, input, verbose, quiet, shallow, replace_originals, ignore, *args, **kwargs): # NOQA
111+
def extractcode(ctx, input, verbose, quiet, shallow, replace_originals, ignore, all_formats, *args, **kwargs): # NOQA
111112
"""extract archives and compressed files found in the <input> file or directory tree.
112113
113114
Archives found inside an extracted archive are extracted recursively.
115+
Use --shallow for a shallow extraction.
114116
Extraction for each archive is done in-place in a new directory named
115117
'<archive file name>-extract' created side-by-side with an archive.
116118
"""
@@ -125,17 +127,26 @@ def extract_event(item):
125127
return ''
126128
if not item:
127129
return ''
130+
128131
source = item.source
129132
if not isinstance(source, str):
130133
source = toascii(source, translit=True).decode('utf-8', 'replace')
134+
131135
if verbose:
132136
if item.done:
133137
return ''
134-
line = source and get_relative_path(path=source, len_base_path=len_base_path, base_is_dir=base_is_dir) or ''
138+
line = source and get_relative_path(
139+
path=source,
140+
len_base_path=len_base_path,
141+
base_is_dir=base_is_dir,
142+
) or ''
143+
135144
else:
136145
line = source and fileutils.file_name(source) or ''
146+
137147
if not isinstance(line, str):
138148
line = toascii(line, translit=True).decode('utf-8', 'replace')
149+
139150
return 'Extracting: %(line)s' % locals()
140151

141152
def display_extract_summary():
@@ -149,11 +160,19 @@ def display_extract_summary():
149160
has_errors = has_errors or bool(xev.errors)
150161
has_warnings = has_warnings or bool(xev.warnings)
151162
source = fileutils.as_posixpath(xev.source)
163+
152164
if not isinstance(source, str):
153165
source = toascii(source, translit=True).decode('utf-8', 'replace')
154-
source = get_relative_path(path=source, len_base_path=len_base_path, base_is_dir=base_is_dir)
166+
167+
source = get_relative_path(
168+
path=source,
169+
len_base_path=len_base_path,
170+
base_is_dir=base_is_dir,
171+
)
172+
155173
for e in xev.errors:
156174
echo_stderr('ERROR extracting: %(source)s: %(e)s' % locals(), fg='red')
175+
157176
for warn in xev.warnings:
158177
echo_stderr('WARNING extracting: %(source)s: %(warn)s' % locals(), fg='yellow')
159178

@@ -174,7 +193,12 @@ def display_extract_summary():
174193
has_extract_errors = False
175194

176195
extractibles = extract_archives(
177-
abs_location, recurse=not shallow, replace_originals=replace_originals, ignore_pattern=ignore)
196+
abs_location,
197+
recurse=not shallow,
198+
replace_originals=replace_originals,
199+
ignore_pattern=ignore,
200+
all_formats=all_formats,
201+
)
178202

179203
if not quiet:
180204
echo_stderr('Extracting archives...', fg='green')

src/extractcode/extract.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,17 @@ def extract(
123123
if recurse and a nested archive is found, it is extracted to full depth
124124
first before resuming the file system walk.
125125
"""
126+
127+
extract_events = extract_files(
128+
location=location,
129+
kinds=kinds,
130+
recurse=recurse,
131+
ignore_pattern=ignore_pattern,
132+
)
133+
126134
processed_events = []
127135
processed_events_append = processed_events.append
128-
for event in extract_files(location, kinds, recurse, ignore_pattern):
136+
for event in extract_events:
129137
yield event
130138
if replace_originals:
131139
processed_events_append(event)
@@ -155,7 +163,9 @@ def extract_files(
155163
Extract only archives of a kind listed in the `kinds` kind tuple.
156164
157165
If `recurse` is True, extract recursively archives nested inside other
158-
archives. If `recurse` is false, then do not extract further an already
166+
archives.
167+
168+
If `recurse` is false, then do not extract further an already
159169
extracted archive identified by the corresponding extract suffix location.
160170
"""
161171
ignored = partial(ignore.is_ignored, ignores=ignore.default_ignores, unignores={})
@@ -193,7 +203,11 @@ def extract_files(
193203
logger.debug('extract:target: %(target)r' % locals())
194204

195205
# extract proper
196-
for xevent in extract_file(loc, target, kinds):
206+
for xevent in extract_file(
207+
location=loc,
208+
target=target,
209+
kinds=kinds,
210+
):
197211
if TRACE:
198212
logger.debug('extract:walk:extraction event: %(xevent)r' % locals())
199213
yield xevent
@@ -217,6 +231,7 @@ def extract_file(
217231
target,
218232
kinds=extractcode.default_kinds,
219233
verbose=False,
234+
all_formats=False,
220235
):
221236
"""
222237
Extract a single archive at `location` in the `target` directory if it is

0 commit comments

Comments
 (0)