Skip to content

Allow user to specify encoding #137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 76 additions & 44 deletions pycdlib/pycdlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,8 +705,8 @@ def _seek_to_extent(self, extent):
self._cdfp.seek(extent * self.logical_block_size)

@functools.lru_cache(maxsize=256)
def _find_iso_record(self, iso_path):
# type: (bytes) -> dr.DirectoryRecord
def _find_iso_record(self, iso_path, encoding='utf-8'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
An internal method to find a directory record on the ISO given an ISO
path. If the entry is found, it returns the directory record object
Expand All @@ -715,14 +715,15 @@ def _find_iso_record(self, iso_path):

Parameters:
iso_path - The ISO9660 path to lookup.
encoding - The string encoding used for the path.
Returns:
The directory record entry representing the entry on the ISO.
"""
return _find_dr_record_by_name(self.pvd, iso_path, 'utf-8')
return _find_dr_record_by_name(self.pvd, iso_path, encoding)

@functools.lru_cache(maxsize=256)
def _find_rr_record(self, rr_path):
# type: (bytes) -> dr.DirectoryRecord
def _find_rr_record(self, rr_path, encoding='utf-8'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
An internal method to find a directory record on the ISO given a Rock
Ridge path. If the entry is found, it returns the directory record
Expand All @@ -731,6 +732,7 @@ def _find_rr_record(self, rr_path):

Parameters:
rr_path - The Rock Ridge path to lookup.
encoding - The string encoding used for the path.
Returns:
The directory record entry representing the entry on the ISO.
"""
Expand All @@ -742,7 +744,7 @@ def _find_rr_record(self, rr_path):

splitpath = utils.split_path(rr_path)

currpath = splitpath.pop(0).decode('utf-8').encode('utf-8')
currpath = splitpath.pop(0).decode('utf-8').encode(encoding)

entry = root_dir_record

Expand Down Expand Up @@ -793,13 +795,13 @@ def _find_rr_record(self, rr_path):
if not child.is_dir():
break
entry = child
currpath = splitpath.pop(0).decode('utf-8').encode('utf-8')
currpath = splitpath.pop(0).decode('utf-8').encode(encoding)

raise pycdlibexception.PyCdlibInvalidInput('Could not find path')

@functools.lru_cache(maxsize=256)
def _find_joliet_record(self, joliet_path):
# type: (bytes) -> dr.DirectoryRecord
def _find_joliet_record(self, joliet_path, encoding='utf-16_be'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
An internal method to find a directory record on the ISO given a Joliet
path. If the entry is found, it returns the directory record object
Expand All @@ -808,12 +810,13 @@ def _find_joliet_record(self, joliet_path):

Parameters:
joliet_path - The Joliet path to lookup.
encoding - The string encoding used for the path.
Returns:
The directory record entry representing the entry on the ISO.
"""
if self.joliet_vd is None:
raise pycdlibexception.PyCdlibInternalError('Joliet path requested on non-Joliet ISO')
return _find_dr_record_by_name(self.joliet_vd, joliet_path, 'utf-16_be')
return _find_dr_record_by_name(self.joliet_vd, joliet_path, encoding)

@functools.lru_cache(maxsize=256)
def _find_udf_record(self, udf_path):
Expand Down Expand Up @@ -2412,8 +2415,8 @@ def _udf_get_file_from_iso_fp(self, outfp, blocksize, udf_path):
utils.copy_data(data_len, blocksize, data_fp, outfp)

def _get_file_from_iso_fp(self, outfp, blocksize, iso_path, rr_path,
joliet_path):
# type: (BinaryIO, int, Optional[bytes], Optional[bytes], Optional[bytes]) -> None
joliet_path, encoding=''):
# type: (BinaryIO, int, Optional[bytes], Optional[bytes], Optional[bytes], str) -> None
"""
An internal method to fetch a single file from the ISO and write it out
to the file object.
Expand All @@ -2427,19 +2430,23 @@ def _get_file_from_iso_fp(self, outfp, blocksize, iso_path, rr_path,
with iso_path and joliet_path).
joliet_path - The absolute Joliet path to lookup on the ISO (exclusive
with iso_path and rr_path).
encoding - The string encoding used for the path.
Returns:
Nothing.
"""
if joliet_path is not None:
if self.joliet_vd is None:
raise pycdlibexception.PyCdlibInvalidInput('Cannot fetch a joliet_path from a non-Joliet ISO')
found_record = self._find_joliet_record(joliet_path)
encoding = encoding or 'utf-16_be'
found_record = self._find_joliet_record(joliet_path, encoding)
elif rr_path is not None:
if not self.rock_ridge:
raise pycdlibexception.PyCdlibInvalidInput('Cannot fetch a rr_path from a non-Rock Ridge ISO')
found_record = self._find_rr_record(rr_path)
encoding = encoding or 'utf-8'
found_record = self._find_rr_record(rr_path, encoding)
elif iso_path is not None:
found_record = self._find_iso_record(iso_path)
encoding = encoding or 'utf-8'
found_record = self._find_iso_record(iso_path, encoding)
else:
raise pycdlibexception.PyCdlibInternalError('Invalid path passed to get_file_from_iso_fp')

Expand Down Expand Up @@ -3472,52 +3479,55 @@ def _rm_joliet_dir(self, joliet_path):

return num_bytes_to_remove

def _get_iso_entry(self, iso_path):
# type: (bytes) -> dr.DirectoryRecord
def _get_iso_entry(self, iso_path, encoding='utf-8'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
Internal method to get the directory record for an ISO path.

Parameters:
iso_path - The path on the ISO filesystem to look up the record for.
encoding - The string encoding used for the path.
Returns:
A dr.DirectoryRecord object representing the path.
"""
if self._needs_reshuffle:
self._reshuffle_extents()

return self._find_iso_record(iso_path)
return self._find_iso_record(iso_path, encoding)

def _get_rr_entry(self, rr_path):
# type: (bytes) -> dr.DirectoryRecord
def _get_rr_entry(self, rr_path, encoding='utf-8'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
Internal method to get the directory record for a Rock Ridge path.

Parameters:
rr_path - The Rock Ridge path on the ISO filesystem to look up the
record for.
encoding - The string encoding used for the path.
Returns:
A dr.DirectoryRecord object representing the path.
"""
if self._needs_reshuffle:
self._reshuffle_extents()

return self._find_rr_record(rr_path)
return self._find_rr_record(rr_path, encoding)

def _get_joliet_entry(self, joliet_path):
# type: (bytes) -> dr.DirectoryRecord
def _get_joliet_entry(self, joliet_path, encoding='utf-16_be'):
# type: (bytes, str) -> dr.DirectoryRecord
"""
Internal method to get the directory record for a Joliet path.

Parameters:
joliet_path - The path on the Joliet filesystem to look up the record
for.
encoding - The string encoding used for the path.
Returns:
A dr.DirectoryRecord object representing the path.
"""
if self._needs_reshuffle:
self._reshuffle_extents()

return self._find_joliet_record(joliet_path)
return self._find_joliet_record(joliet_path, encoding)

def _get_udf_entry(self, udf_path):
# type: (str) -> udfmod.UDFFileEntry
Expand Down Expand Up @@ -4103,6 +4113,7 @@ def get_file_from_iso(self, local_path, **kwargs):
with iso_path, rr_path, and udf_path).
udf_path - The absolute UDF path to lookup on the ISO (exclusive with
iso_path, rr_path, and joliet_path).
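encoding - The encoding to use for parsing the filenames. If not given,
'utf-16_be' is used for a joliet_path and 'utf-8' for an rr_path
or iso_path. It is not used when a udf_path is given.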
Returns:
Nothing.
"""
Expand All @@ -4114,6 +4125,7 @@ def get_file_from_iso(self, local_path, **kwargs):
iso_path = None
rr_path = None
udf_path = None
encoding = ''
num_paths = 0
for key, value in kwargs.items():
if key == 'blocksize':
Expand Down Expand Up @@ -4144,6 +4156,10 @@ def get_file_from_iso(self, local_path, **kwargs):
num_paths += 1
elif value is not None:
raise pycdlibexception.PyCdlibInvalidInput('iso_path must be a string')
elif key == 'encoding':
if not isinstance(value, str):
raise pycdlibexception.PyCdlibInvalidInput('encoding must be a string')
encoding = value
else:
raise pycdlibexception.PyCdlibInvalidInput('Unknown keyword %s' % (key))

Expand All @@ -4155,7 +4171,7 @@ def get_file_from_iso(self, local_path, **kwargs):
self._udf_get_file_from_iso_fp(fp, blocksize, udf_path)
else:
self._get_file_from_iso_fp(fp, blocksize, iso_path, rr_path,
joliet_path)
joliet_path, encoding)

def get_file_from_iso_fp(self, outfp, **kwargs):
# type: (BinaryIO, Union[str, int]) -> None
Expand All @@ -4173,6 +4189,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
with iso_path, rr_path, and udf_path).
udf_path - The absolute UDF path to lookup on the ISO (exclusive with
iso_path, rr_path, and joliet_path).
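encoding - The encoding to use for parsing the filenames. If not given,
'utf-16_be' is used for a joliet_path and 'utf-8' for an rr_path
or iso_path. It is not used when a udf_path is given.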
Returns:
Nothing.
"""
Expand All @@ -4184,6 +4201,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
iso_path = None
rr_path = None
udf_path = None
encoding = None
num_paths = 0
for key, value in kwargs.items():
if key == 'blocksize':
Expand Down Expand Up @@ -4214,6 +4232,10 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
num_paths += 1
elif value is not None:
raise pycdlibexception.PyCdlibInvalidInput('udf_path must be a string')
elif key == 'encoding':
if not isinstance(value, str):
raise pycdlibexception.PyCdlibInvalidInput('encoding must be a string')
encoding = value
else:
raise pycdlibexception.PyCdlibInvalidInput('Unknown keyword %s' % (key))

Expand All @@ -4224,7 +4246,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
self._udf_get_file_from_iso_fp(outfp, blocksize, udf_path)
else:
self._get_file_from_iso_fp(outfp, blocksize, iso_path, rr_path,
joliet_path)
joliet_path, encoding)

def get_and_write(self, iso_path, local_path, blocksize=8192):
# type: (str, str, int) -> None
Expand Down Expand Up @@ -5450,6 +5472,7 @@ def list_children(self, **kwargs):
rr_path - The absolute Rock Ridge path on the ISO to list the children for.
joliet_path - The absolute Joliet path on the ISO to list the children for.
udf_path - The absolute UDF path on the ISO to list the children for.
encoding - The string encoding used for the path; defaults to 'utf-16_be'
for a joliet_path and to 'utf-8' for an rr_path or iso_path.
Yields:
Children of this path.
Returns:
Expand All @@ -5463,6 +5486,8 @@ def list_children(self, **kwargs):
if key in ('joliet_path', 'rr_path', 'iso_path', 'udf_path'):
if value is not None:
num_paths += 1
elif key in ('encoding'):
continue
else:
raise pycdlibexception.PyCdlibInvalidInput("Invalid keyword, must be one of 'iso_path', 'rr_path', 'joliet_path', or 'udf_path'")

Expand All @@ -5480,12 +5505,15 @@ def list_children(self, **kwargs):
else:
use_rr = False
if 'joliet_path' in kwargs:
rec = self._get_joliet_entry(self._normalize_joliet_path(kwargs['joliet_path']))
kwargs['encoding'] = kwargs.get('encoding') or 'utf-16_be'
rec = self._get_joliet_entry(self._normalize_joliet_path(kwargs['joliet_path']), kwargs['encoding'])
elif 'rr_path' in kwargs:
rec = self._get_rr_entry(utils.normpath(kwargs['rr_path']))
kwargs['encoding'] = kwargs.get('encoding') or 'utf-8'
rec = self._get_rr_entry(utils.normpath(kwargs['rr_path']), kwargs['encoding'])
use_rr = True
else:
rec = self._get_iso_entry(utils.normpath(kwargs['iso_path']))
kwargs['encoding'] = kwargs.get('encoding') or 'utf-8'
rec = self._get_iso_entry(utils.normpath(kwargs['iso_path']), kwargs['encoding'])

for c in _yield_children(rec, use_rr): # pylint: disable=use-yield-from
yield c
Expand Down Expand Up @@ -5630,14 +5658,15 @@ def rm_isohybrid(self):

self.isohybrid_mbr = None

def full_path_from_dirrecord(self, rec, rockridge=False):
# type: (Union[dr.DirectoryRecord, udfmod.UDFFileEntry], bool) -> str
def full_path_from_dirrecord(self, rec, rockridge=False, user_encoding=''):
# type: (Union[dr.DirectoryRecord, udfmod.UDFFileEntry], bool, str) -> str
"""
Get the absolute path of a directory record.

Parameters:
rec - The directory record to get the full path for.
rockridge - Whether to get the rock ridge full path.
user_encoding - The string encoding for the path, as supplied by the caller.
When empty, the encoding inferred from the record is used.
Returns:
A string representing the absolute path to the file on the ISO.
"""
Expand All @@ -5650,6 +5679,9 @@ def full_path_from_dirrecord(self, rec, rockridge=False):
if self.joliet_vd is not None and id(rec.vd) == id(self.joliet_vd):
encoding = 'utf-16_be'

if user_encoding:
encoding = user_encoding

# A root entry has no Rock Ridge entry, even on a Rock Ridge ISO.
# Always return / here.
if rec.is_root:
Expand Down Expand Up @@ -5689,6 +5721,8 @@ def full_path_from_dirrecord(self, rec, rockridge=False):
encoding = rec.file_ident.encoding
else:
encoding = 'utf-8'
if user_encoding:
encoding = user_encoding
udf_rec = rec # type: Optional[udfmod.UDFFileEntry]
while udf_rec is not None:
ident = udf_rec.file_identifier()
Expand Down Expand Up @@ -5859,12 +5893,11 @@ def walk(self, **kwargs):
raise pycdlibexception.PyCdlibInvalidInput('This object is not initialized; call either open() or new() to create an ISO')

num_paths = 0
user_encoding = None
user_encoding = ''
for key, value in kwargs.items():
if key in ('joliet_path', 'rr_path', 'iso_path', 'udf_path'):
if value is not None:
num_paths += 1
elif key == 'encoding':
if key in ('joliet_path', 'rr_path', 'iso_path', 'udf_path') and value is not None:
num_paths += 1
elif key == 'encoding' and value:
user_encoding = value
else:
raise pycdlibexception.PyCdlibInvalidInput("Invalid keyword, must be one of 'iso_path', 'rr_path', 'joliet_path', or 'udf_path'")
Expand Down Expand Up @@ -5901,23 +5934,22 @@ def walk(self, **kwargs):
while dirs:
dir_record = dirs.popleft()

relpath = self.full_path_from_dirrecord(dir_record,
rockridge=path_type == 'rr_path')
relpath = self.full_path_from_dirrecord(dir_record, rockridge=path_type == 'rr_path',
user_encoding=user_encoding)
dirlist = []
filelist = []
dirdict = {}

for child in reversed(list(self.list_children(**{path_type: relpath}))):
for child in reversed(list(self.list_children(**{path_type: relpath, 'encoding': user_encoding or default_encoding}))):
if child is None or child.is_dot() or child.is_dotdot():
continue

if user_encoding is not None:
if user_encoding != '':
encoding = user_encoding
elif isinstance(child, udfmod.UDFFileEntry) and child.file_ident is not None:
encoding = child.file_ident.encoding
else:
if isinstance(child, udfmod.UDFFileEntry) and child.file_ident is not None:
encoding = child.file_ident.encoding
else:
encoding = default_encoding
encoding = default_encoding or 'utf-8'

if path_type == 'rr_path':
name = child.rock_ridge.name()
Expand Down