
Commit 3c6a4fe

fix: load of large zip file. (#19010)
* fix load of file compressed by zip.
1 parent 5851a25 commit 3c6a4fe

File tree

6 files changed: +74 -12 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ __pycache__/
 .python-version

 *.zip
+!tests/data/ontime_200.csv.zip
 *.profraw

 # tpch data set

src/common/compress/src/decode.rs

Lines changed: 20 additions & 3 deletions
@@ -357,21 +357,38 @@ impl DecompressDecoder {
         Ok(main)
     }

-    pub fn decompress_all_zip(compressed: &[u8]) -> databend_common_exception::Result<Vec<u8>> {
+    pub fn decompress_all_zip(
+        compressed: &[u8],
+        path: &str,
+        memory_limit: usize,
+    ) -> databend_common_exception::Result<Vec<u8>> {
         let mut zip = ZipArchive::new(Cursor::new(compressed)).map_err(|e| {
             ErrorCode::InvalidCompressionData(format!("compression data invalid: {e}"))
         })?;
         if zip.len() > 1 {
+            // if we want to support loading of zip with multi files later, need to resolve
+            // 1. separate output bytes of files
+            // 2. a formal way to repr the path of each file, for metadata and error reporting
+            // 3. atomic file load
             return Err(ErrorCode::InvalidCompressionData(
                 "Zip only supports single file",
             ));
         }
+        if memory_limit > 0 {
+            if let Some(size) = zip.decompressed_size() {
+                if size + compressed.len() as u128 > memory_limit as u128 / 2 {
+                    return Err(ErrorCode::BadBytes(format!(
+                        "zip file {path} is too large, decompressed_size = {size}",
+                    )));
+                }
+            }
+        }
         let mut file = zip.by_index(0).map_err(|e| {
             ErrorCode::InvalidCompressionData(format!("compression data invalid: {e}"))
         })?;
         let mut bytes = Vec::new();
+        // todo: split to 16MB batches
         file.read_to_end(&mut bytes)?;
-
         Ok(bytes)
     }

@@ -611,7 +628,7 @@ mod tests {
         rng.fill_bytes(&mut content);

         let compressed_content = CompressCodec::compress_all_zip(&content, "unload.csv")?;
-        let result = DecompressDecoder::decompress_all_zip(&compressed_content)?;
+        let result = DecompressDecoder::decompress_all_zip(&compressed_content, "", 0)?;

         assert_eq!(result, content);
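
The guard above uses the archive's own metadata, so oversized files are rejected before any bytes are inflated; the new path parameter is only used to name the file in the error message, and a memory_limit of 0 disables the check (as the updated unit test shows). As a standalone illustration only (not Databend code), the same check restated against the zip crate directly; check_zip_fits is a hypothetical name:

use std::io::Cursor;

use zip::ZipArchive;

// Illustrative sketch of the size guard; not part of the Databend codebase.
fn check_zip_fits(compressed: &[u8], memory_limit: usize) -> Result<(), String> {
    let zip = ZipArchive::new(Cursor::new(compressed)).map_err(|e| e.to_string())?;
    // memory_limit == 0 means "no limit", as in the updated unit test.
    if memory_limit > 0 {
        if let Some(size) = zip.decompressed_size() {
            // The compressed input and the decompressed output coexist in memory,
            // hence the comparison against half of the limit.
            if size + compressed.len() as u128 > memory_limit as u128 / 2 {
                return Err(format!("zip is too large, decompressed_size = {size}"));
            }
        }
    }
    Ok(())
}

In the diff itself this logic sits inside decompress_all_zip, which then reads the single entry via by_index(0); the halving is presumably there because both copies of the data are held at once.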

src/query/sql/src/planner/semantic/type_check.rs

Lines changed: 6 additions & 1 deletion
@@ -51,6 +51,7 @@ use databend_common_ast::parser::parse_expr;
 use databend_common_ast::parser::tokenize_sql;
 use databend_common_ast::parser::Dialect;
 use databend_common_ast::Span;
+use databend_common_base::runtime::GLOBAL_MEM_STAT;
 use databend_common_catalog::catalog::CatalogManager;
 use databend_common_catalog::plan::InternalColumn;
 use databend_common_catalog::plan::InternalColumnType;

@@ -5405,7 +5406,11 @@ impl<'a> TypeChecker<'a> {
             Some(algo) => {
                 log::trace!("Decompressing module using {:?} algorithm", algo);
                 if algo == CompressAlgorithm::Zip {
-                    DecompressDecoder::decompress_all_zip(&code_blob)
+                    DecompressDecoder::decompress_all_zip(
+                        &code_blob,
+                        &module_path,
+                        GLOBAL_MEM_STAT.get_limit() as usize,
+                    )
                 } else {
                     let mut decoder = DecompressDecoder::new(algo);
                     decoder.decompress_all(&code_blob)

src/query/storages/stage/src/read/row_based/processors/decompressor.rs

Lines changed: 30 additions & 8 deletions
@@ -14,6 +14,7 @@

 use std::sync::Arc;

+use databend_common_base::runtime::GLOBAL_MEM_STAT;
 use databend_common_compress::CompressAlgorithm;
 use databend_common_compress::DecompressDecoder;
 use databend_common_compress::DecompressState;

@@ -32,6 +33,7 @@ pub struct Decompressor {
     algo: Option<CompressAlgorithm>,
     decompressor: Option<(DecompressDecoder, usize)>,
     path: Option<String>,
+    zip_buf: Vec<u8>,
 }

 impl Decompressor {

@@ -41,6 +43,7 @@ impl Decompressor {
             algo,
             path: None,
             decompressor: None,
+            zip_buf: Vec::new(),
         })
     }

@@ -55,6 +58,7 @@ impl Decompressor {

         if let Some(algo) = algo {
             if matches!(algo, CompressAlgorithm::Zip) {
+                self.zip_buf.clear();
                 return;
             }
             let decompressor = DecompressDecoder::new(algo);

@@ -82,15 +86,33 @@ impl AccumulatingTransform for Decompressor {
             }
         }
         if matches!(self.algo, Some(CompressAlgorithm::Zip)) {
-            let bytes = DecompressDecoder::decompress_all_zip(&batch.data)?;
+            let memory_limit = GLOBAL_MEM_STAT.get_limit() as usize;
+            if memory_limit > 0 && self.zip_buf.len() + batch.data.len() > memory_limit / 3 {
+                return Err(ErrorCode::BadBytes(format!(
+                    "zip file {} is larger than memory_limit/3 ({})",
+                    batch.path,
+                    memory_limit / 3
+                )));
+            }
+            self.zip_buf.extend_from_slice(&batch.data);

-            let new_batch = Box::new(BytesBatch {
-                data: bytes,
-                path: batch.path.clone(),
-                offset: batch.data.len(),
-                is_eof: batch.is_eof,
-            });
-            return Ok(vec![DataBlock::empty_with_meta(new_batch)]);
+            return if batch.is_eof {
+                let bytes = DecompressDecoder::decompress_all_zip(
+                    &self.zip_buf,
+                    &batch.path,
+                    memory_limit,
+                )?;
+                let new_batch = Box::new(BytesBatch {
+                    data: bytes,
+                    path: batch.path.clone(),
+                    offset: 0,
+                    is_eof: batch.is_eof,
+                });
+                self.zip_buf.clear();
+                Ok(vec![DataBlock::empty_with_meta(new_batch)])
+            } else {
+                Ok(vec![])
+            };
         }
         if let Some((de, offset)) = &mut self.decompressor {
             let mut data = de.decompress_batch(&batch.data).map_err(|e| {
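
The key change here is that zip input is no longer decompressed per incoming batch: a zip archive's central directory sits at the end of the file, so the processor now accumulates every batch into zip_buf and only calls decompress_all_zip once is_eof is set, with the buffered compressed bytes capped at a third of the memory limit. A minimal, self-contained sketch of that accumulate-until-EOF pattern, using a hypothetical ZipAccumulator type and the zip crate directly instead of DecompressDecoder:

use std::io::{Cursor, Read};

use zip::ZipArchive;

// Hypothetical stand-in for the Decompressor's zip path: buffer chunks,
// refuse to grow past memory_limit / 3, and decode only at EOF.
struct ZipAccumulator {
    buf: Vec<u8>,
    memory_limit: usize,
}

impl ZipAccumulator {
    fn push(&mut self, chunk: &[u8], is_eof: bool) -> Result<Option<Vec<u8>>, String> {
        if self.memory_limit > 0 && self.buf.len() + chunk.len() > self.memory_limit / 3 {
            return Err(format!(
                "zip input is larger than memory_limit/3 ({})",
                self.memory_limit / 3
            ));
        }
        self.buf.extend_from_slice(chunk);
        if !is_eof {
            // Nothing can be decoded yet: the central directory is at the end.
            return Ok(None);
        }
        // At EOF, open the whole buffered archive and read its single entry.
        let bytes = {
            let mut zip =
                ZipArchive::new(Cursor::new(self.buf.as_slice())).map_err(|e| e.to_string())?;
            let mut file = zip.by_index(0).map_err(|e| e.to_string())?;
            let mut out = Vec::new();
            file.read_to_end(&mut out).map_err(|e| e.to_string())?;
            out
        };
        self.buf.clear();
        Ok(Some(bytes))
    }
}

In the real processor the decompressed bytes are wrapped back into a single BytesBatch with offset: 0, so downstream row-based readers see the whole file exactly once.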

tests/data/ontime_200.csv.zip

8.27 KB
Binary file not shown.

tests/sqllogictests/suites/stage/on_time.test

Lines changed: 17 additions & 0 deletions
@@ -1,4 +1,8 @@
 # use prepared data ontime
+# Test with small buffer to simulate chunked reading
+statement ok
+set input_read_buffer_size = 1024
+
 statement ok
 truncate table ontime

@@ -92,3 +96,16 @@ select count(1), avg(Year), sum(DayOfWeek) from ontime

 statement ok
 truncate table ontime
+
+query TIITI
+copy into ontime from @data/ontime_200.csv.zip pattern = '' FILE_FORMAT = (type = csv skip_header = 1 compression = 'zip')
+----
+ontime_200.csv.zip 199 0 NULL NULL
+
+query III
+select count(1), avg(Year), sum(DayOfWeek) from ontime
+----
+199 2020.0 769
+
+statement ok
+truncate table ontime
