Skip to content

Commit d6f8044

Browse files
committed
Add support for VM image extraction #16
This is a two-step extraction: libguestfs is first used to dump the image filesystem to a tarball, which is then extracted normally by extractcode (hence dealing with links, device files and other permission oddities as a side effect). We support VDI (VirtualBox), VMDK (VMware) and QCOW2 (QEMU) images. Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 056b6c1 commit d6f8044

File tree

2 files changed

+182
-3
lines changed

2 files changed

+182
-3
lines changed

src/extractcode/archive.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from extractcode import libarchive2
4242
from extractcode import patch
4343
from extractcode import sevenzip
44+
from extractcode import vmimage
4445

4546
from extractcode.uncompress import uncompress_gzip
4647
from extractcode.uncompress import uncompress_bzip2
@@ -469,6 +470,7 @@ def try_to_extract(location, target_dir, extractor):
469470
extract_xz = sevenzip.extract
470471
extract_lzma = sevenzip.extract
471472
extract_squashfs = sevenzip.extract
473+
extract_vm_image = vmimage.extract
472474
extract_cab = sevenzip.extract
473475
extract_nsis = sevenzip.extract
474476
extract_ishield = sevenzip.extract
@@ -1034,7 +1036,7 @@ def try_to_extract(location, target_dir, extractor):
10341036
)
10351037

10361038
SquashfsHandler = Handler(
1037-
name='squashfs FS',
1039+
name='SquashFS disk image',
10381040
filetypes=('squashfs',),
10391041
mimetypes=(),
10401042
extensions=(),
@@ -1043,7 +1045,38 @@ def try_to_extract(location, target_dir, extractor):
10431045
strict=False
10441046
)
10451047

1046-
`PatchHandler = Handler(
1048+
QCOWHandler = Handler(
1049+
# note that there are v1, v2 and v3 formats.
1050+
name='QEMU QCOW2 disk image',
1051+
filetypes=('qemu qcow2 image',),
1052+
mimetypes=('application/octet-stream',),
1053+
extensions=('.qcow2',),
1054+
kind=file_system,
1055+
extractors=[extract_vm_image, extract_tar],
1056+
strict=False,
1057+
)
1058+
1059+
VMDKHandler = Handler(
1060+
name='VMDK disk image',
1061+
filetypes=('vmware4 disk image',),
1062+
mimetypes=('application/octet-stream',),
1063+
extensions=('.vmdk',),
1064+
kind=file_system,
1065+
extractors=[extract_vm_image, extract_tar],
1066+
strict=True,
1067+
)
1068+
1069+
VirtualBoxHandler = Handler(
1070+
name='VirtualBox disk image',
1071+
filetypes=('virtualbox disk image',),
1072+
mimetypes=('application/octet-stream',),
1073+
extensions=('.vdi',),
1074+
kind=file_system,
1075+
extractors=[extract_vm_image, extract_tar],
1076+
strict=True,
1077+
)
1078+
1079+
PatchHandler = Handler(
10471080
name='Patch',
10481081
filetypes=('diff', 'patch',),
10491082
mimetypes=('text/x-diff',),
@@ -1111,5 +1144,8 @@ def try_to_extract(location, target_dir, extractor):
11111144
AppleDmgHandler,
11121145
IsoImageHandler,
11131146
SquashfsHandler,
1114-
PatchHandler
1147+
QCOWHandler,
1148+
VMDKHandler,
1149+
VirtualBoxHandler,
1150+
PatchHandler,
11151151
]

src/extractcode/vmimage.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
#
2+
# Copyright (c) nexB Inc. and others.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Visit https://aboutcode.org and https://github.com/nexB/ for support and download.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# Licensed under the Apache License, Version 2.0 (the "License");
9+
# you may not use this file except in compliance with the License.
10+
# You may obtain a copy of the License at
11+
#
12+
# http://www.apache.org/licenses/LICENSE-2.0
13+
#
14+
# Unless required by applicable law or agreed to in writing, software
15+
# distributed under the License is distributed on an "AS IS" BASIS,
16+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
# See the License for the specific language governing permissions and
18+
# limitations under the License.
19+
#
20+
21+
import logging
22+
import os
23+
import shutil
24+
import warnings
25+
26+
from commoncode import command
27+
from commoncode import fileutils
28+
from commoncode.system import on_linux
29+
30+
from extractcode import ExtractErrorFailedToExtract
31+
32+
"""
33+
Support to extract Virtual Machine image formats and the filesystem(s) they
34+
contain. This is based on libguestfs-tools and is tested only on Linux.
35+
Works only if libguestfs tool guestfish is in the path.
36+
37+
See https://libguestfs.org/
38+
39+
On Ubuntu, you may face this issue when running guestfish:
40+
41+
- https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725
42+
- https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662
43+
- https://unix.stackexchange.com/a/642914/185837
44+
"""
45+
46+
logger = logging.getLogger(__name__)
47+
48+
TRACE = False
49+
50+
if TRACE:
51+
import sys
52+
logging.basicConfig(stream=sys.stdout)
53+
logger.setLevel(logging.DEBUG)
54+
55+
GUESTFISH_NOT_FOUND = (
56+
'WARNING: guestfish executable is not installed. '
57+
'Unable to extract virtual machine image: you need to install the '
58+
'guestfish tool from libguestfs and extra FS drivers if needed. '
59+
'See https://libguestfs.org/ for details.'
60+
)
61+
62+
63+
def get_command():
    """
    Look up the guestfish executable on the PATH.

    Return its location as a string when found. Otherwise emit a warning with
    installation hints and return None.
    """
    guestfish = shutil.which('guestfish')
    if guestfish:
        return guestfish

    # Not installed (or not on the PATH): tell the user how to fix it.
    warnings.warn(GUESTFISH_NOT_FOUND)
    return None
72+
73+
74+
def extract(location, target_dir):
    """
    Dump the filesystem of the guestfish-supported VM image file at `location`
    as a gzipped tarball created in the `target_dir` directory.

    The image format is selected from the file extension of `location`
    (compared case-insensitively): .qcow2, .vmdk or .vdi.

    Return a list of warning messages if any or an empty list.
    Raise an ExtractErrorFailedToExtract exception on errors.

    The tarball is named after the image file name with a ".tar.gz" suffix
    appended. This function only creates it: it does not unpack it and does
    not delete it.

    This works only on Linux and only if the guestfish command is in the PATH.
    """
    if not on_linux:
        raise ExtractErrorFailedToExtract(
            f'VM Image extraction only supported on Linux for: {location}')

    assert location
    abs_location = os.path.abspath(os.path.expanduser(location))
    if not os.path.exists(abs_location):
        raise ExtractErrorFailedToExtract(
            f'The system cannot find the path specified: {abs_location}')

    assert target_dir
    abs_target_dir = os.path.abspath(os.path.expanduser(target_dir))
    if not os.path.exists(abs_target_dir):
        raise ExtractErrorFailedToExtract(
            f'The system cannot find the target path specified: {target_dir}')

    cmd_loc = get_command()
    if not cmd_loc:
        raise ExtractErrorFailedToExtract(GUESTFISH_NOT_FOUND)

    supported_gfs_formats_by_extension = {
        '.qcow2': 'qcow2',
        '.vmdk': 'vmdk',
        '.vdi': 'vdi',
    }
    # Lowercase the extension so that "DISK.VMDK" is accepted like "disk.vmdk".
    extension = fileutils.file_extension(abs_location).lower()
    image_format = supported_gfs_formats_by_extension.get(extension)

    if not image_format:
        raise ExtractErrorFailedToExtract(f'Unsupported image format: {location}')

    filename = fileutils.file_name(abs_location)

    # Use the expanded, absolute paths that were validated above: guestfish is
    # not run through a shell, so a "~" in a raw path would never be expanded.
    target_tarball = os.path.join(abs_target_dir, f'{filename}.tar.gz')

    # All guestfish options come first, then the command and its arguments.
    args = [
        '--ro',
        # --format must stay before the --add of the image it describes.
        f'--format={image_format}',
        '--inspector',
        '--add', abs_location,
        'tar-out', '/', target_tarball, 'compress:gzip',
    ]

    rc, stdout, stderr = command.execute2(cmd_loc=cmd_loc, args=args)

    if rc != 0:
        if TRACE:
            logger.debug(
                f'extract: failure: {rc}\n'
                f'stderr: {stderr}\n'
                f'stdout: {stdout}\n')
        error = f'{stdout}\n{stderr}'
        raise ExtractErrorFailedToExtract(error)

    return []

0 commit comments

Comments
 (0)