Skip to content

Commit 7081e6f

Browse files
committed
Merge branch 'develop' into master
* develop: Update CHANGELOG.md [http] Re-introduce read optimization when buffer is empty (#907) [http] Eliminate _read_iter and simplify read (#906)
2 parents f835f58 + 79917ca commit 7081e6f

File tree

3 files changed

+70
-48
lines changed

3 files changed

+70
-48
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# 7.4.4, 2025-11-04
2+
3+
- [http] Eliminate _read_iter and simplify read (PR [#906](https://github.com/piskvorky/smart_open/pull/906), [@ddelange](https://github.com/ddelange))
4+
- [http] Re-introduce read optimization when buffer is empty (PR [#907](https://github.com/piskvorky/smart_open/pull/907), [@ddelange](https://github.com/ddelange))
5+
16
# 7.4.3, 2025-11-03
27

38
- Simplify WHENCE_END logic (PR [#902](https://github.com/piskvorky/smart_open/pull/902), [@ddelange](https://github.com/ddelange))

smart_open/http.py

Lines changed: 14 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,6 @@ def __init__(self, url, mode='r', buffer_size=DEFAULT_BUFFER_SIZE,
138138
if not self.response.ok:
139139
self.response.raise_for_status()
140140

141-
self._read_iter = iter(lambda: self.response.raw.read(self.buffer_size), b"")
142141
self._read_buffer = bytebuffer.ByteBuffer(buffer_size)
143142
self._current_pos = 0
144143

@@ -150,7 +149,6 @@ def close(self):
150149
logger.debug("close: called")
151150
if not self.closed:
152151
self.response = None
153-
self._read_iter = None
154152
self._read_buffer = None
155153

156154
@property
@@ -175,32 +173,23 @@ def read(self, size=-1):
175173
"""
176174
Mimics the read call to a filehandle object.
177175
"""
176+
if size < -1:
177+
raise ValueError(f'size must be >= -1, got {size}')
178+
178179
logger.debug("reading with size: %d", size)
179-
if self.response is None:
180-
return b''
181-
182-
if size == 0:
183-
return b''
184-
elif size < 0 and len(self._read_buffer) == 0:
185-
retval = self.response.raw.read()
186-
elif size < 0:
187-
retval = self._read_buffer.read() + self.response.raw.read()
180+
if self.closed or size == 0:
181+
return b""
182+
183+
if size == -1:
184+
if len(self._read_buffer):
185+
retval = self._read_buffer.read() + self.response.raw.read()
186+
else: # Avoid unnecessary +
187+
retval = self.response.raw.read()
188188
else:
189+
# Fill _read_buffer until it contains enough bytes
189190
while len(self._read_buffer) < size:
190-
logger.debug(
191-
"http reading more content at current_pos: %d with size: %d",
192-
self._current_pos, size,
193-
)
194-
bytes_read = self._read_buffer.fill(self._read_iter)
195-
if bytes_read == 0:
196-
# Oops, ran out of data early.
197-
retval = self._read_buffer.read()
198-
self._current_pos += len(retval)
199-
200-
return retval
201-
202-
# If we got here, it means we have enough data in the buffer
203-
# to return to the caller.
191+
if self._read_buffer.fill(self.response.raw) == 0:
192+
break # EOF reached
204193
retval = self._read_buffer.read(size)
205194

206195
self._current_pos += len(retval)
@@ -281,13 +270,11 @@ def seek(self, offset, whence=0):
281270

282271
if new_pos == self.content_length:
283272
self.response = None
284-
self._read_iter = None
285273
self._read_buffer.empty()
286274
else:
287275
response = self._partial_request(new_pos)
288276
if response.ok:
289277
self.response = response
290-
self._read_iter = iter(lambda: self.response.raw.read(self.buffer_size), b"")
291278
self._read_buffer.empty()
292279
else:
293280
self.response = None

tests/test_http.py

Lines changed: 51 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -226,30 +226,60 @@ def callback(request):
226226

227227
responses.add_callback(responses.GET, URL, callback=callback)
228228
# Explicitly request gzip encoding
229-
reader = smart_open.http.SeekableBufferedInputBase(
230-
URL, headers={'Accept-Encoding': 'gzip'}
231-
)
229+
reader = smart_open.http.SeekableBufferedInputBase(URL, headers={'Accept-Encoding': 'gzip'})
232230
read_bytes = reader.read()
233231
assert read_bytes == BYTES # Should be decompressed by requests/urllib3
234232

233+
# Combining multiple read calls also works
234+
reader.seek(0)
235+
partial = reader.read(2) + reader.read(1000)
236+
assert partial == BYTES
237+
235238

236239
@responses.activate
237-
def test_gzip_consistency_between_read_methods():
238-
"""Regression test: read() and read(size) should return same decompressed data."""
239-
def callback(request):
240-
# Server returns gzipped data when client accepts it
241-
if 'gzip' in request.headers.get('Accept-Encoding', ''):
242-
headers = HEADERS.copy()
243-
headers['Content-Encoding'] = 'gzip'
244-
headers['Content-Length'] = str(len(GZIPPED_BYTES))
245-
return (200, headers, GZIPPED_BYTES)
246-
else:
247-
headers = HEADERS.copy()
248-
headers['Content-Length'] = str(len(BYTES))
249-
return (200, headers, BYTES)
240+
def test_read_after_read_to_eof():
241+
"""Reading after reading to EOF should return empty bytes."""
242+
responses.add_callback(responses.GET, URL, callback=request_callback)
243+
reader = smart_open.http.SeekableBufferedInputBase(URL)
250244

251-
responses.add_callback(responses.GET, URL, callback=callback)
252-
reader = smart_open.http.SeekableBufferedInputBase(URL, headers={'Accept-Encoding': 'gzip'})
253-
partial = reader.read(2) + reader.read()
254-
assert len(partial) == len(BYTES), f"Expected {len(BYTES)} bytes, got {len(partial)}"
255-
assert partial == BYTES
245+
# Read to EOF
246+
result = reader.read(-1)
247+
assert len(result) == len(BYTES)
248+
assert reader.tell() == len(BYTES)
249+
250+
# Read should return empty bytes
251+
result = reader.read()
252+
assert result == b""
253+
254+
# Read with size should also return empty bytes
255+
result = reader.read(10)
256+
assert result == b""
257+
258+
259+
@responses.activate
260+
def test_read_after_seek_to_eof():
261+
"""Reading after seeking to EOF should return empty bytes."""
262+
responses.add_callback(responses.GET, URL, callback=request_callback)
263+
reader = smart_open.http.SeekableBufferedInputBase(URL)
264+
265+
# Seek to EOF
266+
reader.seek(0, whence=smart_open.constants.WHENCE_END)
267+
assert reader.tell() == len(BYTES)
268+
269+
# Read should return empty bytes
270+
result = reader.read()
271+
assert result == b""
272+
273+
# Read with size should also return empty bytes
274+
result = reader.read(10)
275+
assert result == b""
276+
277+
278+
@responses.activate
279+
def test_read_with_invalid_size():
280+
"""Read with size < -1 should raise ValueError."""
281+
responses.add_callback(responses.GET, URL, callback=request_callback)
282+
reader = smart_open.http.SeekableBufferedInputBase(URL)
283+
284+
with pytest.raises(ValueError, match='size must be >= -1'):
285+
reader.read(-2)

0 commit comments

Comments
 (0)