5
5
from pathlib import Path
6
6
from typing import Tuple
7
7
8
- import chardet
8
+ import charset_normalizer
9
9
from cachetools import LRUCache
10
10
11
11
@@ -24,14 +24,12 @@ def __init__(self, max_cache_size=None):
24
24
# Default fallback encoding
25
25
self .default_encoding = 'utf-8'
26
26
# Confidence threshold for encoding detection
27
- self .confidence_threshold = 0.7
27
+ self .confidence_threshold = 0.9
28
28
29
29
def detect_encoding (self , path : Path ) -> str :
30
30
"""Detect the encoding of a file without handling caching logic.
31
-
32
31
Args:
33
32
path: Path to the file
34
-
35
33
Returns:
36
34
The detected encoding or default encoding if detection fails
37
35
"""
@@ -44,28 +42,25 @@ def detect_encoding(self, path: Path) -> str:
44
42
with open (path , 'rb' ) as f :
45
43
raw_data = f .read (sample_size )
46
44
47
- result = chardet .detect (raw_data )
45
+ # Use charset_normalizer instead of chardet
46
+ results = charset_normalizer .detect (raw_data )
48
47
49
- # Use detected encoding if confidence is high enough, otherwise fallback
50
- encoding = (
51
- result ['encoding' ]
52
- if (result ['encoding' ] and result ['confidence' ] > self .confidence_threshold )
53
- else self .default_encoding
54
- )
48
+ # detect() returns a single chardet-style dict (not a list of matches); use its encoding only when confidence exceeds the threshold
49
+ if results and results ['confidence' ] > self .confidence_threshold :
50
+ encoding = results ['encoding' ]
51
+ else :
52
+ encoding = self .default_encoding
55
53
56
54
return encoding
57
55
58
56
def get_encoding (self , path : Path ) -> str :
59
57
"""Get encoding for a file, using cache or detecting if necessary.
60
-
61
58
Args:
62
59
path: Path to the file
63
-
64
60
Returns:
65
61
The encoding for the file
66
62
"""
67
63
path_str = str (path )
68
-
69
64
# If file doesn't exist, return default encoding
70
65
if not path .exists ():
71
66
return self .default_encoding
@@ -89,13 +84,10 @@ def get_encoding(self, path: Path) -> str:
89
84
90
85
def with_encoding (method ):
91
86
"""Decorator to handle file encoding for file operations.
92
-
93
87
This decorator automatically detects and applies the correct encoding
94
88
for file operations, ensuring consistency between read and write operations.
95
-
96
89
Args:
97
90
method: The method to decorate
98
-
99
91
Returns:
100
92
The decorated method
101
93
"""
@@ -114,7 +106,6 @@ def wrapper(self, path: Path, *args, **kwargs):
114
106
else :
115
107
# Get encoding from the encoding manager for existing files
116
108
encoding = self ._encoding_manager .get_encoding (path )
117
-
118
109
# Add encoding to kwargs if the method accepts it
119
110
if 'encoding' not in kwargs :
120
111
kwargs ['encoding' ] = encoding
0 commit comments