Skip to content

Commit 4a8ef69

Browse files
committed
Improve documentation and readability
Apply formatting and minor refactoring. Refine and clarify documentation. Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent bbbffbc commit 4a8ef69

15 files changed

+500
-237
lines changed

extractcode.ABOUT

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
about_resource: .
22
copyright: copyright (c) nexB. Inc. and others
3-
description: A mostly universal archive extractor using z7zip, libarchve, other
4-
libraries and the Python standard library for reliable archive extraction.
5-
It is used by ScanCode toolkit and related projects
6-
keywords: archive, extraction, libarchive, 7zip, scancode-toolkit
3+
description: A mostly universal archive extractor using 7zip, libarchive and the
4+
Python standard library for reliable archive extraction on Linux, Windows and
5+
macOS. It is used by ScanCode toolkit and related projects.
6+
keywords: archive, extraction, libarchive, 7zip, gzip, xz, lzma, bzip2, tar, ar, cpio, scancode-toolkit
77
homepage_url: https://github.com/nexB/extractcode
88
holder: nexB. Inc. and others
99
holder_contact: info@aboutcode.org

setup.cfg

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ author_email = info@aboutcode.org
1010
license = Apache-2.0
1111

1212
# description must be on ONE line https://github.com/pypa/setuptools/issues/1390
13-
description = A mostly universal archive extractor using z7zip, libarchve, other libraries and the Python standard library for reliable archive extraction.
13+
description = A mostly universal archive extractor using 7zip, libarchive and the Python standard library for reliable archive extraction.
1414
long_description = file:README.rst
1515
url = https://github.com/nexB/extractcode
1616
classifiers =
@@ -26,6 +26,27 @@ keywords =
2626
extraction
2727
libarchive
2828
7zip
29+
7z
30+
gzip
31+
bzip2
32+
xz
33+
lzma
34+
lz4
35+
lzip
36+
zstd
37+
Z
38+
tar
39+
xar
40+
ar
41+
cpio
42+
vmdk
43+
qcow2
44+
vhd
45+
iso
46+
deb
47+
cab
48+
rpm
49+
patch
2950
scancode-toolkit
3051

3152
[options]

src/extractcode/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,11 @@ def remove_backslashes_and_dotdots(directory):
137137
def new_name(location, is_dir=False):
138138
"""
139139
Return a new non-existing location from a `location` usable to write a file
140-
or create directory without overwriting existing files or directories in the same
141-
parent directory, ignoring the case of the filename.
140+
or create directory without overwriting existing files or directories in the
141+
same parent directory, ignoring the case of the filename.
142142
143-
The case of the filename is ignored to ensure that similar results are returned
144-
across case sensitive (*nix) and case insensitive file systems.
143+
The case of the filename is ignored to ensure that similar results are
144+
returned across case sensitive (*nix) and case insensitive file systems.
145145
146146
To find a new unique filename, this tries new names this way:
147147
* pad a directory name with _X where X is an incremented number.

src/extractcode/archive.py

Lines changed: 58 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
from commoncode import filetype
1616
from commoncode import functional
1717
from commoncode.ignore import is_ignored
18-
1918
from typecode import contenttype
2019

2120
from extractcode import all_kinds
@@ -204,7 +203,9 @@ def get_handlers(location):
204203
mtype = T.mimetype_file
205204

206205
if TRACE_DEEP:
207-
logger.debug('get_handlers: processing %(location)s: ftype: %(ftype)s, mtype: %(mtype)s ' % locals())
206+
logger.debug(
207+
'get_handlers: processing %(location)s: '
208+
'ftype: %(ftype)s, mtype: %(mtype)s ' % locals())
208209
for handler in archive_handlers:
209210
if not handler.extractors:
210211
continue
@@ -223,9 +224,19 @@ def get_handlers(location):
223224
extension_matched = exts and location.lower().endswith(exts)
224225

225226
if TRACE_DEEP:
226-
print(f' get_handlers: matched type: {type_matched}, mime: {mime_matched}, ext: {extension_matched}' % locals())
227-
228-
if handler.strict and not (type_matched and mime_matched and extension_matched):
227+
print(
228+
f' get_handlers: matched type: {type_matched}, '
229+
f'mime: {mime_matched}, ext: {extension_matched}' % locals()
230+
)
231+
232+
if (
233+
handler.strict
234+
and not (
235+
type_matched
236+
and mime_matched
237+
and extension_matched
238+
)
239+
):
229240
if TRACE_DEEP:
230241
print(f' get_handlers: skip strict: {handler.name}')
231242
continue
@@ -449,17 +460,30 @@ def try_to_extract(location, target_dir, extractor):
449460

450461
extract_deb = libarchive2.extract
451462

452-
# sevenzip is best for windows lib formats and works fine otherwise. libarchive works on standard ar formats.
453-
extract_ar = functional.partial(extract_with_fallback, extractor1=libarchive2.extract, extractor2=sevenzip.extract)
463+
# sevenzip is best for windows lib formats and works fine otherwise. libarchive
464+
# works on standard ar formats.
465+
extract_ar = functional.partial(
466+
extract_with_fallback,
467+
extractor1=libarchive2.extract,
468+
extractor2=sevenzip.extract,
469+
)
454470

455471
extract_msi = sevenzip.extract
456472
extract_cpio = libarchive2.extract
457473

458474
# sevenzip should be best at extracting 7zip but most often libarchive is better first
459-
extract_7z = functional.partial(extract_with_fallback, extractor1=libarchive2.extract, extractor2=sevenzip.extract)
475+
extract_7z = functional.partial(
476+
extract_with_fallback,
477+
extractor1=libarchive2.extract,
478+
extractor2=sevenzip.extract,
479+
)
460480

461481
# libarchive is best for the run of the mill zips, but sevenzip sometimes is better
462-
extract_zip = functional.partial(extract_with_fallback, extractor1=libarchive2.extract, extractor2=sevenzip.extract)
482+
extract_zip = functional.partial(
483+
extract_with_fallback,
484+
extractor1=libarchive2.extract,
485+
extractor2=sevenzip.extract,
486+
)
463487

464488
extract_springboot = functional.partial(try_to_extract, extractor=extract_zip)
465489

@@ -515,7 +539,12 @@ def try_to_extract(location, target_dir, extractor):
515539

516540
OfficeDocHandler = Handler(
517541
name='Office doc',
518-
filetypes=('zip archive', 'microsoft word 2007+', 'microsoft excel 2007+', 'microsoft powerpoint 2007+'),
542+
filetypes=(
543+
'zip archive',
544+
'microsoft word 2007+',
545+
'microsoft excel 2007+',
546+
'microsoft powerpoint 2007+',
547+
),
519548
mimetypes=('application/zip', 'application/vnd.openxmlformats',),
520549
# Extensions of office documents that are zip files too
521550
extensions=(
@@ -553,7 +582,7 @@ def try_to_extract(location, target_dir, extractor):
553582
strict=True
554583
)
555584

556-
# see http://tools.android.com/tech-docs/new-build-system/aar-formats
585+
# see http://tools.android.com/tech-docs/new-build-system/aar-formats
557586
AndroidLibHandler = Handler(
558587
name='Android library',
559588
filetypes=('zip archive',),
@@ -827,8 +856,16 @@ def try_to_extract(location, target_dir, extractor):
827856
name='Tar bzip2',
828857
filetypes=('bzip2 compressed',),
829858
mimetypes=('application/x-bzip2',),
830-
extensions=('.tar.bz2', '.tar.bz', '.tar.bzip', '.tar.bzip2',
831-
'.tbz', '.tbz2', '.tb2', '.tarbz2',),
859+
extensions=(
860+
'.tar.bz2',
861+
'.tar.bz',
862+
'.tar.bzip',
863+
'.tar.bzip2',
864+
'.tbz',
865+
'.tbz2',
866+
'.tb2',
867+
'.tarbz2',
868+
),
832869
kind=regular_nested,
833870
extractors=[extract_tar],
834871
strict=False
@@ -876,10 +913,11 @@ def try_to_extract(location, target_dir, extractor):
876913

877914
NugetHandler = Handler(
878915
name='Nuget',
879-
# weirdly enough the detection by libmagic is sometimes wrong
880-
# TODO file a bug upstream
881-
# this is due to this: https://en.wikipedia.org/wiki/Open_Packaging_Conventions#File_formats_using_the_OPC
916+
# TODO: file a bug upstream
917+
# Weirdly enough the detection by libmagic is sometimes wrong
918+
# this is due to this issue:
882919
# being recognized by libmagic as an OOXML file
920+
# https://en.wikipedia.org/wiki/Open_Packaging_Conventions#File_formats_using_the_OPC
883921
filetypes=('zip archive', 'microsoft ooxml',),
884922
mimetypes=('application/zip', 'application/octet-stream',),
885923
extensions=('.nupkg',),
@@ -921,7 +959,10 @@ def try_to_extract(location, target_dir, extractor):
921959
DebHandler = Handler(
922960
name='Debian package',
923961
filetypes=('debian binary package',),
924-
mimetypes=('application/vnd.debian.binary-package', 'application/x-archive',),
962+
mimetypes=(
963+
'application/vnd.debian.binary-package',
964+
'application/x-archive',
965+
),
925966
extensions=('.deb', '.udeb',),
926967
kind=package,
927968
extractors=[extract_deb],

src/extractcode/cli.py

Lines changed: 84 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -84,28 +84,85 @@ class ExtractCommand(cliutils.BaseCommand):
8484
@click.command(name='extractcode', epilog=epilog_text, cls=ExtractCommand)
8585
@click.pass_context
8686

87-
@click.argument('input', metavar='<input>', type=click.Path(exists=True, readable=True))
88-
89-
@click.option('--verbose', is_flag=True, default=False, help='Print verbose file-by-file progress messages.')
90-
@click.option('--quiet', is_flag=True, default=False, help='Do not print any summary or progress message.')
91-
@click.option('--shallow', is_flag=True, default=False, help='Do not extract recursively nested archives (e.g. not archives in archives).')
92-
@click.option('--replace-originals', is_flag=True, default=False, help='Replace extracted archives by the extracted content.')
93-
@click.option('--ignore', default=[], multiple=True, help='Ignore files/directories following a glob-pattern.')
94-
@click.option('--all-formats', is_flag=True, default=False, help='Extract archives from all known formats.')
87+
@click.argument(
88+
'input',
89+
metavar='<input>',
90+
type=click.Path(exists=True, readable=True),
91+
)
92+
93+
@click.option(
94+
'--verbose',
95+
is_flag=True,
96+
help='Print verbose file-by-file progress messages.',
97+
)
98+
@click.option(
99+
'--quiet',
100+
is_flag=True,
101+
help='Do not print any summary or progress message.',
102+
)
103+
@click.option(
104+
'--shallow',
105+
is_flag=True,
106+
help='Do not extract recursively nested archives in archives.',
107+
)
108+
@click.option(
109+
'--replace-originals',
110+
is_flag=True,
111+
help='Replace extracted archives by the extracted content.',
112+
)
113+
@click.option(
114+
'--ignore',
115+
default=[],
116+
multiple=True,
117+
help='Ignore files/directories matching this glob pattern.',
118+
)
119+
120+
@click.option(
121+
'--all-formats',
122+
is_flag=True,
123+
help='Extract archives from all known formats.',
124+
)
95125

96126
@click.help_option('-h', '--help')
97-
@click.option('--about', is_flag=True, is_eager=True, callback=print_about, help='Show information about ExtractCode and licensing and exit.')
98-
@click.option('--version', is_flag=True, is_eager=True, callback=print_version, help='Show the version and exit.')
99-
def extractcode(ctx, input, verbose, quiet, shallow, replace_originals, ignore, all_formats, *args, **kwargs): # NOQA
100-
"""extract archives and compressed files found in the <input> file or directory tree.
127+
@click.option(
128+
'--about',
129+
is_flag=True,
130+
is_eager=True,
131+
callback=print_about,
132+
help='Show information about ExtractCode and its licensing and exit.',
133+
)
134+
@click.option(
135+
'--version',
136+
is_flag=True,
137+
is_eager=True,
138+
callback=print_version,
139+
help='Show the version and exit.',
140+
)
141+
def extractcode(
142+
ctx,
143+
input, # NOQA
144+
verbose,
145+
quiet,
146+
shallow,
147+
replace_originals,
148+
ignore,
149+
all_formats,
150+
*args,
151+
**kwargs,
152+
):
153+
"""extract archives and compressed files in the <input> file or directory tree.
101154
102155
Archives found inside an extracted archive are extracted recursively.
103156
Use --shallow for a shallow extraction.
104157
Extraction for each archive is done in-place in a new directory named
105158
'<archive file name>-extract' created side-by-side with an archive.
106159
"""
107160

108-
abs_location = fileutils.as_posixpath(os.path.abspath(os.path.expanduser(input)))
161+
abs_location = fileutils.as_posixpath(
162+
os.path.abspath(
163+
os.path.expanduser(input)
164+
)
165+
)
109166

110167
def extract_event(item):
111168
"""
@@ -159,10 +216,16 @@ def display_extract_summary():
159216
)
160217

161218
for e in xev.errors:
162-
echo_stderr('ERROR extracting: %(source)s: %(e)s' % locals(), fg='red')
219+
echo_stderr(
220+
'ERROR extracting: %(source)s: %(e)s' % locals(),
221+
fg='red'
222+
)
163223

164224
for warn in xev.warnings:
165-
echo_stderr('WARNING extracting: %(source)s: %(warn)s' % locals(), fg='yellow')
225+
echo_stderr(
226+
'WARNING extracting: %(source)s: %(warn)s' % locals(),
227+
fg='yellow'
228+
)
166229

167230
summary_color = 'green'
168231
if has_warnings:
@@ -190,6 +253,7 @@ def display_extract_summary():
190253

191254
if not quiet:
192255
echo_stderr('Extracting archives...', fg='green')
256+
193257
with cliutils.progressmanager(extractibles,
194258
item_show_func=extract_event, verbose=verbose) as extraction_events:
195259

@@ -199,7 +263,9 @@ def display_extract_summary():
199263
if repr(xev) not in unique_extract_events_with_errors:
200264
extract_result_with_errors.append(xev)
201265
unique_extract_events_with_errors.add(repr(xev))
266+
202267
display_extract_summary()
268+
203269
else:
204270
for xev in extractibles:
205271
if xev.done and (xev.warnings or xev.errors):
@@ -211,9 +277,9 @@ def display_extract_summary():
211277

212278
def get_relative_path(path, len_base_path, base_is_dir):
213279
"""
214-
Return a posix relative path from the posix 'path' relative to a
215-
base path of `len_base_path` length where the base is a directory if
216-
`base_is_dir` True or a file otherwise.
280+
Return a posix relative path from the posix 'path' relative to a base path
281+
of `len_base_path` length where the base is a directory if `base_is_dir`
282+
True or a file otherwise.
217283
"""
218284
path = os.fsdecode(path)
219285
if base_is_dir:

0 commit comments

Comments
 (0)