13
13
// limitations under the License.
14
14
15
15
use std:: path:: Path ;
16
+ use std:: pin:: Pin ;
17
+ use std:: sync:: Arc ;
16
18
17
19
use chrono:: DateTime ;
18
20
use chrono:: Utc ;
@@ -22,6 +24,10 @@ use databend_common_exception::Result;
22
24
use databend_common_meta_app:: principal:: StageInfo ;
23
25
use databend_common_meta_app:: principal:: StageType ;
24
26
use databend_common_meta_app:: principal:: UserIdentity ;
27
+ use databend_common_meta_app:: principal:: COPY_MAX_FILES_PER_COMMIT ;
28
+ use futures:: stream;
29
+ use futures:: Stream ;
30
+ use futures:: StreamExt ;
25
31
use futures:: TryStreamExt ;
26
32
use opendal:: EntryMode ;
27
33
use opendal:: Metadata ;
@@ -103,6 +109,34 @@ impl StageFilesInfo {
103
109
}
104
110
}
105
111
112
+ #[ async_backtrace:: framed]
113
+ async fn list_files (
114
+ & self ,
115
+ operator : & Operator ,
116
+ thread_num : usize ,
117
+ max_files : Option < usize > ,
118
+ mut files : & [ String ] ,
119
+ ) -> Result < Vec < StageFileInfo > > {
120
+ if let Some ( m) = max_files {
121
+ files = & files[ ..m]
122
+ }
123
+ let file_infos = self . stat_concurrent ( operator, thread_num, files) . await ?;
124
+ let mut res = Vec :: with_capacity ( file_infos. len ( ) ) ;
125
+
126
+ for file_info in file_infos {
127
+ match file_info {
128
+ Ok ( ( path, meta) ) if meta. is_dir ( ) => {
129
+ return Err ( ErrorCode :: BadArguments ( format ! ( "{path} is not a file" ) ) ) ;
130
+ }
131
+ Ok ( ( path, meta) ) => res. push ( StageFileInfo :: new ( path, & meta) ) ,
132
+ Err ( e) => {
133
+ return Err ( e) ;
134
+ }
135
+ }
136
+ }
137
+ Ok ( res)
138
+ }
139
+
106
140
#[ async_backtrace:: framed]
107
141
pub async fn list (
108
142
& self ,
@@ -111,31 +145,45 @@ impl StageFilesInfo {
111
145
max_files : Option < usize > ,
112
146
) -> Result < Vec < StageFileInfo > > {
113
147
if self . path == STDIN_FD {
114
- return Ok ( vec ! [ stdin_stage_info( ) ? ] ) ;
148
+ return Ok ( vec ! [ stdin_stage_info( ) ] ) ;
115
149
}
116
150
117
- let max_files = max_files. unwrap_or ( usize:: MAX ) ;
118
151
if let Some ( files) = & self . files {
119
- let file_infos = self
120
- . stat_concurrent ( operator, thread_num, max_files, files)
121
- . await ?;
122
- let mut res = Vec :: with_capacity ( file_infos. len ( ) ) ;
152
+ self . list_files ( operator, thread_num, max_files, files)
153
+ . await
154
+ } else {
155
+ let pattern = self . get_pattern ( ) ?;
156
+ StageFilesInfo :: list_files_with_pattern (
157
+ operator,
158
+ & self . path ,
159
+ pattern,
160
+ max_files. unwrap_or ( COPY_MAX_FILES_PER_COMMIT ) ,
161
+ )
162
+ . await
163
+ }
164
+ }
123
165
124
- for file_info in file_infos {
125
- match file_info {
126
- Ok ( ( path, meta) ) if meta. is_dir ( ) => {
127
- return Err ( ErrorCode :: BadArguments ( format ! ( "{path} is not a file" ) ) ) ;
128
- }
129
- Ok ( ( path, meta) ) => res. push ( StageFileInfo :: new ( path, & meta) ) ,
130
- Err ( e) => {
131
- return Err ( e) ;
132
- }
133
- }
134
- }
135
- Ok ( res)
166
+ #[ async_backtrace:: framed]
167
+ pub async fn list_stream (
168
+ & self ,
169
+ operator : & Operator ,
170
+ thread_num : usize ,
171
+ max_files : Option < usize > ,
172
+ ) -> Result < Pin < Box < dyn Stream < Item = Result < StageFileInfo > > + Send > > > {
173
+ if self . path == STDIN_FD {
174
+ return Ok ( Box :: pin ( stream:: iter ( vec ! [ Ok ( stdin_stage_info( ) ) ] ) ) ) ;
175
+ }
176
+
177
+ if let Some ( files) = & self . files {
178
+ let files = self
179
+ . list_files ( operator, thread_num, max_files, files)
180
+ . await ?;
181
+ let files = files. into_iter ( ) . map ( Ok ) ;
182
+ Ok ( Box :: pin ( stream:: iter ( files) ) )
136
183
} else {
137
184
let pattern = self . get_pattern ( ) ?;
138
- StageFilesInfo :: list_files_with_pattern ( operator, & self . path , pattern, max_files) . await
185
+ StageFilesInfo :: list_files_stream_with_pattern ( operator, & self . path , pattern, max_files)
186
+ . await
139
187
}
140
188
}
141
189
@@ -195,40 +243,73 @@ impl StageFilesInfo {
195
243
pattern : Option < Regex > ,
196
244
max_files : usize ,
197
245
) -> Result < Vec < StageFileInfo > > {
246
+ Self :: list_files_stream_with_pattern ( operator, path, pattern, Some ( max_files) )
247
+ . await ?
248
+ . try_collect :: < Vec < _ > > ( )
249
+ . await
250
+ }
251
+
252
+ #[ async_backtrace:: framed]
253
+ pub async fn list_files_stream_with_pattern (
254
+ operator : & Operator ,
255
+ path : & str ,
256
+ pattern : Option < Regex > ,
257
+ max_files : Option < usize > ,
258
+ ) -> Result < Pin < Box < dyn Stream < Item = Result < StageFileInfo > > + Send > > > {
198
259
if path == STDIN_FD {
199
- return Ok ( vec ! [ stdin_stage_info( ) ? ] ) ;
260
+ return Ok ( Box :: pin ( stream :: once ( async { Ok ( stdin_stage_info ( ) ) } ) ) ) ;
200
261
}
201
- let mut files = Vec :: new ( ) ;
202
262
let prefix_len = if path == "/" { 0 } else { path. len ( ) } ;
203
263
let prefix_meta = operator. stat ( path) . await ;
204
- match prefix_meta {
264
+ let file_exact : Option < Result < StageFileInfo > > = match prefix_meta {
205
265
Ok ( meta) if meta. is_file ( ) => {
206
- files. push ( StageFileInfo :: new ( path. to_string ( ) , & meta) ) ;
266
+ let f = StageFileInfo :: new ( path. to_string ( ) , & meta) ;
267
+ if max_files == Some ( 1 ) {
268
+ return Ok ( Box :: pin ( stream:: once ( async { Ok ( f) } ) ) ) ;
269
+ }
270
+ Some ( Ok ( f) )
207
271
}
208
272
Err ( e) if e. kind ( ) != opendal:: ErrorKind :: NotFound => {
209
273
return Err ( e. into ( ) ) ;
210
274
}
211
- _ => { }
275
+ _ => None ,
212
276
} ;
213
- let mut lister = operator
277
+ let file_exact_stream = stream:: iter ( file_exact. clone ( ) . into_iter ( ) ) ;
278
+
279
+ let lister = operator
214
280
. lister_with ( path)
215
281
. recursive ( true )
216
282
. metakey ( StageFileInfo :: meta_query ( ) )
217
283
. await ?;
218
284
219
- if files. len ( ) == max_files {
220
- return Ok ( files) ;
221
- }
222
- while let Some ( obj) = lister. try_next ( ) . await ? {
223
- let meta = obj. metadata ( ) ;
224
- if check_file ( & obj. path ( ) [ prefix_len..] , meta. mode ( ) , & pattern) {
225
- files. push ( StageFileInfo :: new ( obj. path ( ) . to_string ( ) , meta) ) ;
226
- if files. len ( ) == max_files {
227
- return Ok ( files) ;
285
+ let pattern = Arc :: new ( pattern) ;
286
+ let files_with_prefix = lister. filter_map ( move |result| {
287
+ let pattern = pattern. clone ( ) ;
288
+ async move {
289
+ match result {
290
+ Ok ( entry) => {
291
+ let meta = entry. metadata ( ) ;
292
+ if check_file ( & entry. path ( ) [ prefix_len..] , meta. mode ( ) , & pattern) {
293
+ Some ( Ok ( StageFileInfo :: new ( entry. path ( ) . to_string ( ) , meta) ) )
294
+ } else {
295
+ None
296
+ }
297
+ }
298
+ Err ( e) => Some ( Err ( ErrorCode :: from ( e) ) ) ,
228
299
}
229
300
}
301
+ } ) ;
302
+ if let Some ( max_files) = max_files {
303
+ if file_exact. is_some ( ) {
304
+ Ok ( Box :: pin (
305
+ file_exact_stream. chain ( files_with_prefix. take ( max_files - 1 ) ) ,
306
+ ) )
307
+ } else {
308
+ Ok ( Box :: pin ( files_with_prefix. take ( max_files) ) )
309
+ }
310
+ } else {
311
+ Ok ( Box :: pin ( file_exact_stream. chain ( files_with_prefix) ) )
230
312
}
231
- Ok ( files)
232
313
}
233
314
234
315
/// Stat files concurrently.
@@ -237,10 +318,9 @@ impl StageFilesInfo {
237
318
& self ,
238
319
operator : & Operator ,
239
320
thread_num : usize ,
240
- max_files : usize ,
241
321
files : & [ String ] ,
242
322
) -> Result < Vec < Result < ( String , Metadata ) > > > {
243
- if max_files == 1 {
323
+ if files . len ( ) == 1 {
244
324
let Some ( file) = files. first ( ) else {
245
325
return Ok ( vec ! [ ] ) ;
246
326
} ;
@@ -254,7 +334,7 @@ impl StageFilesInfo {
254
334
}
255
335
256
336
// This clone is required to make sure we are not referring to `file: &String` in the closure
257
- let tasks = files. iter ( ) . take ( max_files ) . cloned ( ) . map ( |file| {
337
+ let tasks = files. iter ( ) . cloned ( ) . map ( |file| {
258
338
let full_path = Path :: new ( & self . path )
259
339
. join ( file)
260
340
. to_string_lossy ( )
@@ -292,7 +372,7 @@ fn blocking_list_files_with_pattern(
292
372
max_files : usize ,
293
373
) -> Result < Vec < StageFileInfo > > {
294
374
if path == STDIN_FD {
295
- return Ok ( vec ! [ stdin_stage_info( ) ? ] ) ;
375
+ return Ok ( vec ! [ stdin_stage_info( ) ] ) ;
296
376
}
297
377
let operator = operator. blocking ( ) ;
298
378
let mut files = Vec :: new ( ) ;
@@ -330,14 +410,14 @@ fn blocking_list_files_with_pattern(
330
410
331
411
/// Path of file descriptor 0 (standard input). The listing functions compare the stage path
/// against this value and, on a match, return the entry built by `stdin_stage_info`.
pub const STDIN_FD : & str = "/dev/fd/0" ;
332
412
333
- fn stdin_stage_info ( ) -> Result < StageFileInfo > {
334
- Ok ( StageFileInfo {
413
+ fn stdin_stage_info ( ) -> StageFileInfo {
414
+ StageFileInfo {
335
415
path : STDIN_FD . to_string ( ) ,
336
416
size : u64:: MAX ,
337
417
md5 : None ,
338
418
last_modified : Utc :: now ( ) ,
339
419
etag : None ,
340
420
status : StageFileStatus :: NeedCopy ,
341
421
creator : None ,
342
- } )
422
+ }
343
423
}
0 commit comments