diff --git a/pycdlib/pycdlib.py b/pycdlib/pycdlib.py index 2a9ef720..dbe68a15 100644 --- a/pycdlib/pycdlib.py +++ b/pycdlib/pycdlib.py @@ -705,8 +705,8 @@ def _seek_to_extent(self, extent): self._cdfp.seek(extent * self.logical_block_size) @functools.lru_cache(maxsize=256) - def _find_iso_record(self, iso_path): - # type: (bytes) -> dr.DirectoryRecord + def _find_iso_record(self, iso_path, encoding='utf-8'): + # type: (bytes, str) -> dr.DirectoryRecord """ An internal method to find a directory record on the ISO given an ISO path. If the entry is found, it returns the directory record object @@ -715,14 +715,15 @@ def _find_iso_record(self, iso_path): Parameters: iso_path - The ISO9660 path to lookup. + encoding - The string encoding used for the path. Returns: The directory record entry representing the entry on the ISO. """ - return _find_dr_record_by_name(self.pvd, iso_path, 'utf-8') + return _find_dr_record_by_name(self.pvd, iso_path, encoding) @functools.lru_cache(maxsize=256) - def _find_rr_record(self, rr_path): - # type: (bytes) -> dr.DirectoryRecord + def _find_rr_record(self, rr_path, encoding='utf-8'): + # type: (bytes, str) -> dr.DirectoryRecord """ An internal method to find a directory record on the ISO given a Rock Ridge path. If the entry is found, it returns the directory record @@ -731,6 +732,7 @@ def _find_rr_record(self, rr_path): Parameters: rr_path - The Rock Ridge path to lookup. + encoding - The string encoding used for the path. Returns: The directory record entry representing the entry on the ISO. 
""" @@ -742,7 +744,7 @@ def _find_rr_record(self, rr_path): splitpath = utils.split_path(rr_path) - currpath = splitpath.pop(0).decode('utf-8').encode('utf-8') + currpath = splitpath.pop(0).decode('utf-8').encode(encoding) entry = root_dir_record @@ -793,13 +795,13 @@ def _find_rr_record(self, rr_path): if not child.is_dir(): break entry = child - currpath = splitpath.pop(0).decode('utf-8').encode('utf-8') + currpath = splitpath.pop(0).decode('utf-8').encode(encoding) raise pycdlibexception.PyCdlibInvalidInput('Could not find path') @functools.lru_cache(maxsize=256) - def _find_joliet_record(self, joliet_path): - # type: (bytes) -> dr.DirectoryRecord + def _find_joliet_record(self, joliet_path, encoding='utf-16_be'): + # type: (bytes, str) -> dr.DirectoryRecord """ An internal method to find a directory record on the ISO given a Joliet path. If the entry is found, it returns the directory record object @@ -808,12 +810,13 @@ def _find_joliet_record(self, joliet_path): Parameters: joliet_path - The Joliet path to lookup. + encoding - The string encoding used for the path. Returns: The directory record entry representing the entry on the ISO. 
""" if self.joliet_vd is None: raise pycdlibexception.PyCdlibInternalError('Joliet path requested on non-Joliet ISO') - return _find_dr_record_by_name(self.joliet_vd, joliet_path, 'utf-16_be') + return _find_dr_record_by_name(self.joliet_vd, joliet_path, encoding) @functools.lru_cache(maxsize=256) def _find_udf_record(self, udf_path): @@ -2412,8 +2415,8 @@ def _udf_get_file_from_iso_fp(self, outfp, blocksize, udf_path): utils.copy_data(data_len, blocksize, data_fp, outfp) def _get_file_from_iso_fp(self, outfp, blocksize, iso_path, rr_path, - joliet_path): - # type: (BinaryIO, int, Optional[bytes], Optional[bytes], Optional[bytes]) -> None + joliet_path, encoding=''): + # type: (BinaryIO, int, Optional[bytes], Optional[bytes], Optional[bytes], str) -> None """ An internal method to fetch a single file from the ISO and write it out to the file object. @@ -2427,19 +2430,23 @@ def _get_file_from_iso_fp(self, outfp, blocksize, iso_path, rr_path, with iso_path and joliet_path). joliet_path - The absolute Joliet path to lookup on the ISO (exclusive with iso_path and rr_path). + encoding - The string encoding used for the path. Returns: Nothing. 
""" if joliet_path is not None: if self.joliet_vd is None: raise pycdlibexception.PyCdlibInvalidInput('Cannot fetch a joliet_path from a non-Joliet ISO') - found_record = self._find_joliet_record(joliet_path) + encoding = encoding or 'utf-16_be' + found_record = self._find_joliet_record(joliet_path, encoding) elif rr_path is not None: if not self.rock_ridge: raise pycdlibexception.PyCdlibInvalidInput('Cannot fetch a rr_path from a non-Rock Ridge ISO') - found_record = self._find_rr_record(rr_path) + encoding = encoding or 'utf-8' + found_record = self._find_rr_record(rr_path, encoding) elif iso_path is not None: - found_record = self._find_iso_record(iso_path) + encoding = encoding or 'utf-8' + found_record = self._find_iso_record(iso_path, encoding) else: raise pycdlibexception.PyCdlibInternalError('Invalid path passed to get_file_from_iso_fp') @@ -3472,52 +3479,55 @@ def _rm_joliet_dir(self, joliet_path): return num_bytes_to_remove - def _get_iso_entry(self, iso_path): - # type: (bytes) -> dr.DirectoryRecord + def _get_iso_entry(self, iso_path, encoding='utf-8'): + # type: (bytes, str) -> dr.DirectoryRecord """ Internal method to get the directory record for an ISO path. Parameters: iso_path - The path on the ISO filesystem to look up the record for. + encoding - The string encoding used for the path. Returns: A dr.DirectoryRecord object representing the path. """ if self._needs_reshuffle: self._reshuffle_extents() - return self._find_iso_record(iso_path) + return self._find_iso_record(iso_path, encoding) - def _get_rr_entry(self, rr_path): - # type: (bytes) -> dr.DirectoryRecord + def _get_rr_entry(self, rr_path, encoding='utf-8'): + # type: (bytes, str) -> dr.DirectoryRecord """ Internal method to get the directory record for a Rock Ridge path. Parameters: rr_path - The Rock Ridge path on the ISO filesystem to look up the record for. + encoding - The string encoding used for the path. Returns: A dr.DirectoryRecord object representing the path. 
""" if self._needs_reshuffle: self._reshuffle_extents() - return self._find_rr_record(rr_path) + return self._find_rr_record(rr_path, encoding) - def _get_joliet_entry(self, joliet_path): - # type: (bytes) -> dr.DirectoryRecord + def _get_joliet_entry(self, joliet_path, encoding='utf-16_be'): + # type: (bytes, str) -> dr.DirectoryRecord """ Internal method to get the directory record for a Joliet path. Parameters: joliet_path - The path on the Joliet filesystem to look up the record for. + encoding - The string encoding used for the path. Returns: A dr.DirectoryRecord object representing the path. """ if self._needs_reshuffle: self._reshuffle_extents() - return self._find_joliet_record(joliet_path) + return self._find_joliet_record(joliet_path, encoding) def _get_udf_entry(self, udf_path): # type: (str) -> udfmod.UDFFileEntry @@ -4103,6 +4113,7 @@ def get_file_from_iso(self, local_path, **kwargs): with iso_path, rr_path, and udf_path). udf_path - The absolute UDF path to lookup on the ISO (exclusive with iso_path, rr_path, and joliet_path). + encoding - The encoding to use for parsing the filenames. Returns: Nothing. 
""" @@ -4114,6 +4125,7 @@ def get_file_from_iso(self, local_path, **kwargs): iso_path = None rr_path = None udf_path = None + encoding = '' num_paths = 0 for key, value in kwargs.items(): if key == 'blocksize': @@ -4144,6 +4156,10 @@ def get_file_from_iso(self, local_path, **kwargs): num_paths += 1 elif value is not None: raise pycdlibexception.PyCdlibInvalidInput('iso_path must be a string') + elif key == 'encoding': + if not isinstance(value, str): + raise pycdlibexception.PyCdlibInvalidInput('encoding must be a string') + encoding = value else: raise pycdlibexception.PyCdlibInvalidInput('Unknown keyword %s' % (key)) @@ -4155,7 +4171,7 @@ def get_file_from_iso(self, local_path, **kwargs): self._udf_get_file_from_iso_fp(fp, blocksize, udf_path) else: self._get_file_from_iso_fp(fp, blocksize, iso_path, rr_path, - joliet_path) + joliet_path, encoding) def get_file_from_iso_fp(self, outfp, **kwargs): # type: (BinaryIO, Union[str, int]) -> None @@ -4173,6 +4189,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs): with iso_path, rr_path, and udf_path). udf_path - The absolute UDF path to lookup on the ISO (exclusive with iso_path, rr_path, and joliet_path). + encoding - The encoding to use for parsing the filenames. Returns: Nothing. 
""" @@ -4184,6 +4201,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs): iso_path = None rr_path = None udf_path = None + encoding = '' num_paths = 0 for key, value in kwargs.items(): if key == 'blocksize': @@ -4214,6 +4232,10 @@ def get_file_from_iso_fp(self, outfp, **kwargs): num_paths += 1 elif value is not None: raise pycdlibexception.PyCdlibInvalidInput('udf_path must be a string') + elif key == 'encoding': + if not isinstance(value, str): + raise pycdlibexception.PyCdlibInvalidInput('encoding must be a string') + encoding = value else: raise pycdlibexception.PyCdlibInvalidInput('Unknown keyword %s' % (key)) @@ -4224,7 +4246,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs): self._udf_get_file_from_iso_fp(outfp, blocksize, udf_path) else: self._get_file_from_iso_fp(outfp, blocksize, iso_path, rr_path, - joliet_path) + joliet_path, encoding) def get_and_write(self, iso_path, local_path, blocksize=8192): # type: (str, str, int) -> None @@ -5450,6 +5472,7 @@ def list_children(self, **kwargs): rr_path - The absolute Rock Ridge path on the ISO to list the children for. joliet_path - The absolute Joliet path on the ISO to list the children for. udf_path - The absolute UDF path on the ISO to list the children for. + encoding - The string encoding used for the path; defaults to 'utf-8' or 'utf-16_be' Yields: Children of this path. 
Returns: @@ -5463,6 +5486,8 @@ def list_children(self, **kwargs): if key in ('joliet_path', 'rr_path', 'iso_path', 'udf_path'): if value is not None: num_paths += 1 + elif key == 'encoding': + continue else: raise pycdlibexception.PyCdlibInvalidInput("Invalid keyword, must be one of 'iso_path', 'rr_path', 'joliet_path', or 'udf_path'") @@ -5480,12 +5505,15 @@ def list_children(self, **kwargs): else: use_rr = False if 'joliet_path' in kwargs: - rec = self._get_joliet_entry(self._normalize_joliet_path(kwargs['joliet_path'])) + kwargs['encoding'] = kwargs.get('encoding') or 'utf-16_be' + rec = self._get_joliet_entry(self._normalize_joliet_path(kwargs['joliet_path']), kwargs['encoding']) elif 'rr_path' in kwargs: - rec = self._get_rr_entry(utils.normpath(kwargs['rr_path'])) + kwargs['encoding'] = kwargs.get('encoding') or 'utf-8' + rec = self._get_rr_entry(utils.normpath(kwargs['rr_path']), kwargs['encoding']) use_rr = True else: - rec = self._get_iso_entry(utils.normpath(kwargs['iso_path'])) + kwargs['encoding'] = kwargs.get('encoding') or 'utf-8' + rec = self._get_iso_entry(utils.normpath(kwargs['iso_path']), kwargs['encoding']) for c in _yield_children(rec, use_rr): # pylint: disable=use-yield-from yield c @@ -5630,14 +5658,15 @@ def rm_isohybrid(self): self.isohybrid_mbr = None - def full_path_from_dirrecord(self, rec, rockridge=False): - # type: (Union[dr.DirectoryRecord, udfmod.UDFFileEntry], bool) -> str + def full_path_from_dirrecord(self, rec, rockridge=False, user_encoding=''): + # type: (Union[dr.DirectoryRecord, udfmod.UDFFileEntry], bool, str) -> str """ Get the absolute path of a directory record. Parameters: rec - The directory record to get the full path for. rockridge - Whether to get the rock ridge full path. + user_encoding - The string encoding used for the path as determined by the user. Returns: A string representing the absolute path to the file on the ISO. 
""" @@ -5650,6 +5679,9 @@ def full_path_from_dirrecord(self, rec, rockridge=False): if self.joliet_vd is not None and id(rec.vd) == id(self.joliet_vd): encoding = 'utf-16_be' + if user_encoding: + encoding = user_encoding + # A root entry has no Rock Ridge entry, even on a Rock Ridge ISO. # Always return / here. if rec.is_root: @@ -5689,6 +5721,8 @@ def full_path_from_dirrecord(self, rec, rockridge=False): encoding = rec.file_ident.encoding else: encoding = 'utf-8' + if user_encoding: + encoding = user_encoding udf_rec = rec # type: Optional[udfmod.UDFFileEntry] while udf_rec is not None: ident = udf_rec.file_identifier() @@ -5859,12 +5893,11 @@ def walk(self, **kwargs): raise pycdlibexception.PyCdlibInvalidInput('This object is not initialized; call either open() or new() to create an ISO') num_paths = 0 - user_encoding = None + user_encoding = '' for key, value in kwargs.items(): - if key in ('joliet_path', 'rr_path', 'iso_path', 'udf_path'): - if value is not None: - num_paths += 1 - elif key == 'encoding': + if key in ('joliet_path', 'rr_path', 'iso_path', 'udf_path'): + num_paths += int(value is not None) + elif key == 'encoding': user_encoding = value else: raise pycdlibexception.PyCdlibInvalidInput("Invalid keyword, must be one of 'iso_path', 'rr_path', 'joliet_path', or 'udf_path'") @@ -5901,23 +5934,22 @@ def walk(self, **kwargs): while dirs: dir_record = dirs.popleft() - relpath = self.full_path_from_dirrecord(dir_record, - rockridge=path_type == 'rr_path') + relpath = self.full_path_from_dirrecord(dir_record, rockridge=path_type == 'rr_path', + user_encoding=user_encoding) dirlist = [] filelist = [] dirdict = {} - for child in reversed(list(self.list_children(**{path_type: relpath}))): + for child in reversed(list(self.list_children(**{path_type: relpath, 'encoding': user_encoding or default_encoding}))): if child is None or child.is_dot() or child.is_dotdot(): continue - if user_encoding is not None: + if user_encoding: encoding = 
user_encoding + elif isinstance(child, udfmod.UDFFileEntry) and child.file_ident is not None: + encoding = child.file_ident.encoding else: - if isinstance(child, udfmod.UDFFileEntry) and child.file_ident is not None: - encoding = child.file_ident.encoding - else: - encoding = default_encoding + encoding = default_encoding or 'utf-8' if path_type == 'rr_path': name = child.rock_ridge.name()