Skip to content

Commit 0a45ff1

Browse files
authored
Merge pull request #99 from All-Hands-AI/better-coding-detection
Use `charset-normalizer` for better coding detection
2 parents bebd679 + 2036648 commit 0a45ff1

File tree

4 files changed

+125
-132
lines changed

4 files changed

+125
-132
lines changed

openhands_aci/editor/encoding.py

Lines changed: 9 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
from pathlib import Path
66
from typing import Tuple
77

8-
import chardet
8+
import charset_normalizer
99
from cachetools import LRUCache
1010

1111

@@ -24,14 +24,12 @@ def __init__(self, max_cache_size=None):
2424
# Default fallback encoding
2525
self.default_encoding = 'utf-8'
2626
# Confidence threshold for encoding detection
27-
self.confidence_threshold = 0.7
27+
self.confidence_threshold = 0.9
2828

2929
def detect_encoding(self, path: Path) -> str:
3030
"""Detect the encoding of a file without handling caching logic.
31-
3231
Args:
3332
path: Path to the file
34-
3533
Returns:
3634
The detected encoding or default encoding if detection fails
3735
"""
@@ -44,28 +42,25 @@ def detect_encoding(self, path: Path) -> str:
4442
with open(path, 'rb') as f:
4543
raw_data = f.read(sample_size)
4644

47-
result = chardet.detect(raw_data)
45+
# Use charset_normalizer instead of chardet
46+
results = charset_normalizer.detect(raw_data)
4847

49-
# Use detected encoding if confidence is high enough, otherwise fallback
50-
encoding = (
51-
result['encoding']
52-
if (result['encoding'] and result['confidence'] > self.confidence_threshold)
53-
else self.default_encoding
54-
)
48+
# Get the best match if any exists
49+
if results and results['confidence'] > self.confidence_threshold:
50+
encoding = results['encoding']
51+
else:
52+
encoding = self.default_encoding
5553

5654
return encoding
5755

5856
def get_encoding(self, path: Path) -> str:
5957
"""Get encoding for a file, using cache or detecting if necessary.
60-
6158
Args:
6259
path: Path to the file
63-
6460
Returns:
6561
The encoding for the file
6662
"""
6763
path_str = str(path)
68-
6964
# If file doesn't exist, return default encoding
7065
if not path.exists():
7166
return self.default_encoding
@@ -89,13 +84,10 @@ def get_encoding(self, path: Path) -> str:
8984

9085
def with_encoding(method):
9186
"""Decorator to handle file encoding for file operations.
92-
9387
This decorator automatically detects and applies the correct encoding
9488
for file operations, ensuring consistency between read and write operations.
95-
9689
Args:
9790
method: The method to decorate
98-
9991
Returns:
10092
The decorated method
10193
"""
@@ -114,7 +106,6 @@ def wrapper(self, path: Path, *args, **kwargs):
114106
else:
115107
# Get encoding from the encoding manager for existing files
116108
encoding = self._encoding_manager.get_encoding(path)
117-
118109
# Add encoding to kwargs if the method accepts it
119110
if 'encoding' not in kwargs:
120111
kwargs['encoding'] = encoding

0 commit comments

Comments
 (0)