Skip to content

Commit 7439719

Browse files
committed
Allow user to specify encoding
The code always hard-coded UTF-8 as the encoding, which produced wrong results for Shift-JIS (Japanese) file names. This commit adds an optional encoding argument that defaults to UTF-8 (UTF-16BE for Joliet paths) and can be set to another value when the file names on the ISO are not encoded as UTF-8.
1 parent 9ac6571 commit 7439719

File tree

1 file changed

+48
-36
lines changed

1 file changed

+48
-36
lines changed

pycdlib/pycdlib.py

Lines changed: 48 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,6 @@ def _find_dr_record_by_name(vd, path, encoding):
495495
return root_dir_record
496496

497497
splitpath = utils.split_path(path)
498-
499498
currpath = splitpath.pop(0).decode('utf-8').encode(encoding)
500499

501500
entry = root_dir_record
@@ -518,7 +517,6 @@ def _find_dr_record_by_name(vd, path, encoding):
518517
index = lo
519518
if index != len(thelist) and thelist[index].file_ident == currpath:
520519
child = thelist[index]
521-
522520
if child is None:
523521
# We failed to find this component of the path, so break out of the
524522
# loop and fail.
@@ -533,7 +531,6 @@ def _find_dr_record_by_name(vd, path, encoding):
533531
# We found the last child we are looking for; return it.
534532
if not splitpath:
535533
return child
536-
537534
if not child.is_dir():
538535
break
539536
entry = child
@@ -718,8 +715,8 @@ def _seek_to_extent(self, extent):
718715
self._cdfp.seek(extent * self.logical_block_size)
719716

720717
@lru_cache(maxsize=256)
721-
def _find_iso_record(self, iso_path):
722-
# type: (bytes) -> dr.DirectoryRecord
718+
def _find_iso_record(self, iso_path, encoding='utf-8'):
719+
# type: (bytes, str) -> dr.DirectoryRecord
723720
"""
724721
An internal method to find a directory record on the ISO given an ISO
725722
path. If the entry is found, it returns the directory record object
@@ -731,11 +728,11 @@ def _find_iso_record(self, iso_path):
731728
Returns:
732729
The directory record entry representing the entry on the ISO.
733730
"""
734-
return _find_dr_record_by_name(self.pvd, iso_path, 'utf-8')
731+
return _find_dr_record_by_name(self.pvd, iso_path, encoding)
735732

736733
@lru_cache(maxsize=256)
737-
def _find_rr_record(self, rr_path):
738-
# type: (bytes) -> dr.DirectoryRecord
734+
def _find_rr_record(self, rr_path, encoding='utf-8'):
735+
# type: (bytes, str) -> dr.DirectoryRecord
739736
"""
740737
An internal method to find a directory record on the ISO given a Rock
741738
Ridge path. If the entry is found, it returns the directory record
@@ -755,7 +752,7 @@ def _find_rr_record(self, rr_path):
755752

756753
splitpath = utils.split_path(rr_path)
757754

758-
currpath = splitpath.pop(0).decode('utf-8').encode('utf-8')
755+
currpath = splitpath.pop(0).decode('utf-8').encode(encoding)
759756

760757
entry = root_dir_record
761758

@@ -806,13 +803,13 @@ def _find_rr_record(self, rr_path):
806803
if not child.is_dir():
807804
break
808805
entry = child
809-
currpath = splitpath.pop(0).decode('utf-8').encode('utf-8')
806+
currpath = splitpath.pop(0).decode('utf-8').encode(encoding)
810807

811808
raise pycdlibexception.PyCdlibInvalidInput('Could not find path')
812809

813810
@lru_cache(maxsize=256)
814-
def _find_joliet_record(self, joliet_path):
815-
# type: (bytes) -> dr.DirectoryRecord
811+
def _find_joliet_record(self, joliet_path, encoding='utf-16_be'):
812+
# type: (bytes, str) -> dr.DirectoryRecord
816813
"""
817814
An internal method to find a directory record on the ISO given a Joliet
818815
path. If the entry is found, it returns the directory record object
@@ -826,7 +823,7 @@ def _find_joliet_record(self, joliet_path):
826823
"""
827824
if self.joliet_vd is None:
828825
raise pycdlibexception.PyCdlibInternalError('Joliet path requested on non-Joliet ISO')
829-
return _find_dr_record_by_name(self.joliet_vd, joliet_path, 'utf-16_be')
826+
return _find_dr_record_by_name(self.joliet_vd, joliet_path, encoding)
830827

831828
@lru_cache(maxsize=256)
832829
def _find_udf_record(self, udf_path):
@@ -2425,8 +2422,8 @@ def _udf_get_file_from_iso_fp(self, outfp, blocksize, udf_path):
24252422
utils.copy_data(data_len, blocksize, data_fp, outfp)
24262423

24272424
def _get_file_from_iso_fp(self, outfp, blocksize, iso_path, rr_path,
2428-
joliet_path):
2429-
# type: (BinaryIO, int, Optional[bytes], Optional[bytes], Optional[bytes]) -> None
2425+
joliet_path, encoding=None):
2426+
# type: (BinaryIO, int, Optional[bytes], Optional[bytes], Optional[bytes], str) -> None
24302427
"""
24312428
An internal method to fetch a single file from the ISO and write it out
24322429
to the file object.
@@ -2446,13 +2443,16 @@ def _get_file_from_iso_fp(self, outfp, blocksize, iso_path, rr_path,
24462443
if joliet_path is not None:
24472444
if self.joliet_vd is None:
24482445
raise pycdlibexception.PyCdlibInvalidInput('Cannot fetch a joliet_path from a non-Joliet ISO')
2449-
found_record = self._find_joliet_record(joliet_path)
2446+
encoding = encoding or 'utf-16_be'
2447+
found_record = self._find_joliet_record(joliet_path, encoding)
24502448
elif rr_path is not None:
24512449
if not self.rock_ridge:
24522450
raise pycdlibexception.PyCdlibInvalidInput('Cannot fetch a rr_path from a non-Rock Ridge ISO')
2453-
found_record = self._find_rr_record(rr_path)
2451+
encoding = encoding or 'utf-8'
2452+
found_record = self._find_rr_record(rr_path, encoding)
24542453
elif iso_path is not None:
2455-
found_record = self._find_iso_record(iso_path)
2454+
encoding = encoding or 'utf-8'
2455+
found_record = self._find_iso_record(iso_path, encoding)
24562456
else:
24572457
raise pycdlibexception.PyCdlibInternalError('Invalid path passed to get_file_from_iso_fp')
24582458

@@ -3487,8 +3487,8 @@ def _rm_joliet_dir(self, joliet_path):
34873487

34883488
return num_bytes_to_remove
34893489

3490-
def _get_iso_entry(self, iso_path):
3491-
# type: (bytes) -> dr.DirectoryRecord
3490+
def _get_iso_entry(self, iso_path, encoding='utf-8'):
3491+
# type: (bytes, str) -> dr.DirectoryRecord
34923492
"""
34933493
Internal method to get the directory record for an ISO path.
34943494
@@ -3500,10 +3500,10 @@ def _get_iso_entry(self, iso_path):
35003500
if self._needs_reshuffle:
35013501
self._reshuffle_extents()
35023502

3503-
return self._find_iso_record(iso_path)
3503+
return self._find_iso_record(iso_path, encoding)
35043504

3505-
def _get_rr_entry(self, rr_path):
3506-
# type: (bytes) -> dr.DirectoryRecord
3505+
def _get_rr_entry(self, rr_path, encoding='utf-8'):
3506+
# type: (bytes, str) -> dr.DirectoryRecord
35073507
"""
35083508
Internal method to get the directory record for a Rock Ridge path.
35093509
@@ -3516,10 +3516,10 @@ def _get_rr_entry(self, rr_path):
35163516
if self._needs_reshuffle:
35173517
self._reshuffle_extents()
35183518

3519-
return self._find_rr_record(rr_path)
3519+
return self._find_rr_record(rr_path, encoding)
35203520

3521-
def _get_joliet_entry(self, joliet_path):
3522-
# type: (bytes) -> dr.DirectoryRecord
3521+
def _get_joliet_entry(self, joliet_path, encoding='utf-16_be'):
3522+
# type: (bytes, str) -> dr.DirectoryRecord
35233523
"""
35243524
Internal method to get the directory record for a Joliet path.
35253525
@@ -3532,7 +3532,7 @@ def _get_joliet_entry(self, joliet_path):
35323532
if self._needs_reshuffle:
35333533
self._reshuffle_extents()
35343534

3535-
return self._find_joliet_record(joliet_path)
3535+
return self._find_joliet_record(joliet_path, encoding)
35363536

35373537
def _get_udf_entry(self, udf_path):
35383538
# type: (str) -> udfmod.UDFFileEntry
@@ -4199,6 +4199,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
41994199
iso_path = None
42004200
rr_path = None
42014201
udf_path = None
4202+
encoding = None
42024203
num_paths = 0
42034204
for key, value in kwargs.items():
42044205
if key == 'blocksize':
@@ -4229,6 +4230,8 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
42294230
num_paths += 1
42304231
elif value is not None:
42314232
raise pycdlibexception.PyCdlibInvalidInput('udf_path must be a string')
4233+
elif key == 'encoding':
4234+
encoding = value
42324235
else:
42334236
raise pycdlibexception.PyCdlibInvalidInput('Unknown keyword %s' % (key))
42344237

@@ -4239,7 +4242,7 @@ def get_file_from_iso_fp(self, outfp, **kwargs):
42394242
self._udf_get_file_from_iso_fp(outfp, blocksize, udf_path)
42404243
else:
42414244
self._get_file_from_iso_fp(outfp, blocksize, iso_path, rr_path,
4242-
joliet_path)
4245+
joliet_path, encoding)
42434246

42444247
def get_and_write(self, iso_path, local_path, blocksize=8192):
42454248
# type: (str, str, int) -> None
@@ -5475,6 +5478,8 @@ def list_children(self, **kwargs):
54755478
if key in ('joliet_path', 'rr_path', 'iso_path', 'udf_path'):
54765479
if value is not None:
54775480
num_paths += 1
5481+
elif key in ('encoding'):
5482+
continue
54785483
else:
54795484
raise pycdlibexception.PyCdlibInvalidInput("Invalid keyword, must be one of 'iso_path', 'rr_path', 'joliet_path', or 'udf_path'")
54805485

@@ -5492,12 +5497,15 @@ def list_children(self, **kwargs):
54925497
else:
54935498
use_rr = False
54945499
if 'joliet_path' in kwargs:
5495-
rec = self._get_joliet_entry(self._normalize_joliet_path(kwargs['joliet_path']))
5500+
kwargs['encoding'] = kwargs.get('encoding', None) or 'utf-16_be'
5501+
rec = self._get_joliet_entry(self._normalize_joliet_path(kwargs['joliet_path']), kwargs['encoding'])
54965502
elif 'rr_path' in kwargs:
5497-
rec = self._get_rr_entry(utils.normpath(kwargs['rr_path']))
5503+
kwargs['encoding'] = kwargs.get('encoding', None) or 'utf-8'
5504+
rec = self._get_rr_entry(utils.normpath(kwargs['rr_path']), kwargs['encoding'])
54985505
use_rr = True
54995506
else:
5500-
rec = self._get_iso_entry(utils.normpath(kwargs['iso_path']))
5507+
kwargs['encoding'] = kwargs.get('encoding', None) or 'utf-8'
5508+
rec = self._get_iso_entry(utils.normpath(kwargs['iso_path']), kwargs['encoding'])
55015509

55025510
for c in _yield_children(rec, use_rr):
55035511
yield c
@@ -5642,8 +5650,8 @@ def rm_isohybrid(self):
56425650

56435651
self.isohybrid_mbr = None
56445652

5645-
def full_path_from_dirrecord(self, rec, rockridge=False):
5646-
# type: (Union[dr.DirectoryRecord, udfmod.UDFFileEntry], bool) -> str
5653+
def full_path_from_dirrecord(self, rec, rockridge=False, user_encoding=None):
5654+
# type: (Union[dr.DirectoryRecord, udfmod.UDFFileEntry], bool, str) -> str
56475655
"""
56485656
Get the absolute path of a directory record.
56495657
@@ -5662,6 +5670,8 @@ def full_path_from_dirrecord(self, rec, rockridge=False):
56625670
if self.joliet_vd is not None and id(rec.vd) == id(self.joliet_vd):
56635671
encoding = 'utf-16_be'
56645672

5673+
if user_encoding:
5674+
encoding = user_encoding
56655675
# A root entry has no Rock Ridge entry, even on a Rock Ridge ISO.
56665676
# Always return / here.
56675677
if rec.is_root:
@@ -5701,6 +5711,8 @@ def full_path_from_dirrecord(self, rec, rockridge=False):
57015711
encoding = rec.file_ident.encoding
57025712
else:
57035713
encoding = 'utf-8'
5714+
if user_encoding:
5715+
encoding = user_encoding
57045716
udf_rec = rec # type: Optional[udfmod.UDFFileEntry]
57055717
while udf_rec is not None:
57065718
ident = udf_rec.file_identifier()
@@ -5913,13 +5925,13 @@ def walk(self, **kwargs):
59135925
while dirs:
59145926
dir_record = dirs.popleft()
59155927

5916-
relpath = self.full_path_from_dirrecord(dir_record,
5917-
rockridge=path_type == 'rr_path')
5928+
relpath = self.full_path_from_dirrecord(dir_record, rockridge=path_type == 'rr_path',
5929+
user_encoding=user_encoding)
59185930
dirlist = []
59195931
filelist = []
59205932
dirdict = {}
59215933

5922-
for child in reversed(list(self.list_children(**{path_type: relpath}))):
5934+
for child in reversed(list(self.list_children(**{path_type: relpath, 'encoding': kwargs.get('encoding', None)}))):
59235935
if child is None or child.is_dot() or child.is_dotdot():
59245936
continue
59255937

0 commit comments

Comments
 (0)