
Commit 65abd3e
chore: codefmt
1 parent b26101e
7 files changed: +33 additions, -36 deletions

src/query/service/src/table_functions/infer_schema/infer_schema_table.rs
Lines changed: 1 addition & 3 deletions

@@ -189,9 +189,7 @@ impl Table for InferSchemaTable {
             None => stage_info.file_format_params.clone(),
         };
         let operator = init_stage_operator(&stage_info)?;
-        let stage_file_infos = files_info
-            .list(&operator, 1, self.args_parsed.max_file_count)
-            .await?;
+        let stage_file_infos = files_info.list(&operator, 1, None).await?;
         Ok((
             PartStatistics::default(),
             Partitions::create(PartitionsShuffleKind::Seq, vec![
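
For context, the third argument to `files_info.list` is an optional cap on how many files are listed; passing `None` instead of the parsed `max_file_count` means schema inference now lists every matching stage file. A minimal sketch of that `Option<usize>`-cap pattern, using a hypothetical `list_files` helper rather than Databend's actual API:

// Illustrative stand-in for the `Option<usize>` cap taken by
// `files_info.list(&operator, 1, None)` above; `list_files` is
// hypothetical, not Databend's real signature.
fn list_files(all: &[&str], max_files: Option<usize>) -> Vec<String> {
    match max_files {
        // Some(n): stop after n entries, as the removed `max_file_count` did.
        Some(n) => all.iter().take(n).map(|s| s.to_string()).collect(),
        // None: no cap, every matching file is returned.
        None => all.iter().map(|s| s.to_string()).collect(),
    }
}

fn main() {
    let files = ["a.csv", "b.csv", "c.csv"];
    assert_eq!(list_files(&files, Some(2)).len(), 2);
    assert_eq!(list_files(&files, None).len(), 3);
}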

src/query/service/src/table_functions/infer_schema/separator.rs
Lines changed: 27 additions & 0 deletions

@@ -35,6 +35,8 @@ use databend_common_storages_stage::BytesBatch;
 
 use crate::table_functions::infer_schema::merge::merge_schema;
 
+const MAX_SINGLE_FILE_BYTES: usize = 100 * 1024 * 1024;
+
 pub struct InferSchemaSeparator {
     pub file_format_params: FileFormatParams,
     files: HashMap<String, Vec<u8>>,
@@ -76,6 +78,14 @@ impl AccumulatingTransform for InferSchemaSeparator {
         let bytes = self.files.entry(batch.path.clone()).or_default();
         bytes.extend(batch.data);
 
+        if bytes.len() > MAX_SINGLE_FILE_BYTES {
+            return Err(ErrorCode::InvalidArgument(format!(
+                "The file '{}' is too large(maximum allowed: {})",
+                batch.path,
+                human_readable_size(MAX_SINGLE_FILE_BYTES),
+            )));
+        }
+
         // When max_records exists, it will try to use the current bytes to read, otherwise it will buffer all bytes
         if self.max_records.is_none() && !batch.is_eof {
             return Ok(vec![DataBlock::empty()]);
@@ -177,3 +187,20 @@ impl AccumulatingTransform for InferSchemaSeparator {
         Ok(vec![block])
     }
 }
+
+fn human_readable_size(bytes: usize) -> String {
+    const KB: f64 = 1024.0;
+    const MB: f64 = KB * 1024.0;
+    const GB: f64 = MB * 1024.0;
+
+    let b = bytes as f64;
+    if b >= GB {
+        format!("{:.2} GB", b / GB)
+    } else if b >= MB {
+        format!("{:.2} MB", b / MB)
+    } else if b >= KB {
+        format!("{:.2} KB", b / KB)
+    } else {
+        format!("{} B", bytes)
+    }
+}
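
The new `human_readable_size` helper is self-contained, so its behavior can be checked in isolation. Below is a standalone copy with a small driver: the function body is taken verbatim from the diff, while the `main` and its expected outputs are our addition.

fn human_readable_size(bytes: usize) -> String {
    const KB: f64 = 1024.0;
    const MB: f64 = KB * 1024.0;
    const GB: f64 = MB * 1024.0;

    let b = bytes as f64;
    if b >= GB {
        format!("{:.2} GB", b / GB)
    } else if b >= MB {
        format!("{:.2} MB", b / MB)
    } else if b >= KB {
        format!("{:.2} KB", b / KB)
    } else {
        format!("{} B", bytes)
    }
}

fn main() {
    // The limit baked into the separator: 100 * 1024 * 1024 bytes.
    assert_eq!(human_readable_size(100 * 1024 * 1024), "100.00 MB");
    assert_eq!(human_readable_size(512), "512 B");
    assert_eq!(human_readable_size(3 * 1024 * 1024 * 1024), "3.00 GB");
}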

src/query/service/src/table_functions/infer_schema/table_args.rs
Lines changed: 0 additions & 6 deletions

@@ -26,7 +26,6 @@ pub(crate) struct InferSchemaArgsParsed {
     pub(crate) file_format: Option<String>,
     pub(crate) files_info: StageFilesInfo,
     pub(crate) max_records: Option<usize>,
-    pub(crate) max_file_count: Option<usize>,
 }
 
 impl InferSchemaArgsParsed {
@@ -42,7 +41,6 @@ impl InferSchemaArgsParsed {
             pattern: None,
         };
         let mut max_records = None;
-        let mut max_file_count = None;
 
         for (k, v) in &args {
             match k.to_lowercase().as_str() {
@@ -61,9 +59,6 @@ impl InferSchemaArgsParsed {
                 "max_records_pre_file" => {
                     max_records = Some(i64_value(v)? as usize);
                 }
-                "max_file_count" => {
-                    max_file_count = Some(i64_value(v)? as usize);
-                }
                 _ => {
                     return Err(ErrorCode::BadArguments(format!(
                         "unknown param {} for infer_schema",
@@ -82,7 +77,6 @@ impl InferSchemaArgsParsed {
             file_format,
             files_info,
             max_records,
-            max_file_count,
         })
    }
}
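
With the `max_file_count` arm gone, the parser's catch-all `_` branch now rejects that key like any other unknown one. A toy version of this lowercase-match dispatch, with names and error type simplified for illustration (not Databend's actual API):

// Toy version of the keyword-argument dispatch in `InferSchemaArgsParsed`;
// `parse_args` and the String error are illustrative simplifications.
fn parse_args(args: &[(&str, i64)]) -> Result<Option<usize>, String> {
    let mut max_records = None;
    for (k, v) in args {
        match k.to_lowercase().as_str() {
            "max_records_pre_file" => max_records = Some(*v as usize),
            // Unknown keys, now including the removed `max_file_count`,
            // fall through to an error.
            other => return Err(format!("unknown param {} for infer_schema", other)),
        }
    }
    Ok(max_records)
}

fn main() {
    assert!(parse_args(&[("MAX_RECORDS_PRE_FILE", 10)]).is_ok());
    assert!(parse_args(&[("max_file_count", 2)]).is_err());
}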

tests/data/csv/max_file_count/numbers0.csv

Lines changed: 0 additions & 5 deletions
This file was deleted.

tests/data/csv/max_file_count/numbers1.csv

Lines changed: 0 additions & 4 deletions
This file was deleted.

tests/data/csv/max_file_count/numbers2.csv

Lines changed: 0 additions & 4 deletions
This file was deleted.

tests/sqllogictests/suites/stage/formats/parquet/infer_schema.test
Lines changed: 5 additions & 14 deletions

@@ -61,11 +61,11 @@ drop CONNECTION IF EXISTS my_conn
 statement ok
 create CONNECTION my_conn STORAGE_TYPE = 's3' access_key_id='minioadmin' secret_access_key='minioadmin' endpoint_url='http://127.0.0.1:9900/' region='auto'
 
-query
-select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn')
-----
-id INT 0 0
-t TUPLE(A INT32, B STRING) 0 1
+# query
+# select * from INFER_SCHEMA(location => 's3://testbucket/data/parquet/tuple.parquet', connection_name => 'my_conn')
+# ----
+# id INT 0 0
+# t TUPLE(A INT32, B STRING) 0 1
 
 # CSV
 statement ok
@@ -144,15 +144,6 @@ col3 VARCHAR 1 2
 col4 VARCHAR 1 3
 col5 VARCHAR 1 4
 
-query TTBI
-select * from infer_schema(location => '@data/csv/max_file_count/', file_format => 'head_csv_format', max_file_count => 2);
-----
-col1 BIGINT 1 0
-col2 BIGINT 1 1
-col3 BIGINT 1 2
-col4 BIGINT 1 3
-col5 BIGINT 1 4
-
 # NDJSON
 query TTBI
 select * from infer_schema(location => '@data/ndjson/numbers.ndjson', file_format => 'NDJSON');
