Commit 56024af

feat: produce snapshot chunks lazily
1 parent: 58558ed

File tree

5 files changed: +261 −152 lines

influxdb3/src/commands/serve.rs

Lines changed: 2 additions & 2 deletions

@@ -573,8 +573,8 @@ pub async fn command(config: Config) -> Result<()> {

    info!("setting up background mem check for query buffer");
    background_buffer_checker(
-        // config.force_snapshot_mem_threshold.bytes(),
-        734003200,
+        config.force_snapshot_mem_threshold.bytes(),
+        // 734003200,
        &write_buffer_impl,
    )
    .await;
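
Worth noting: the literal that was hard-coded here (presumably a leftover from development) is exactly 700 MiB, so this change simply switches the background buffer checker back to the user-configured threshold. A quick check of the arithmetic, as a standalone sketch (the constant name is illustrative, not from the codebase):

fn main() {
    // The previously hard-coded literal equals 700 MiB expressed in bytes.
    const HARD_CODED_THRESHOLD_BYTES: u64 = 734_003_200;
    assert_eq!(HARD_CODED_THRESHOLD_BYTES, 700 * 1024 * 1024);
}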

influxdb3_write/src/paths.rs

Lines changed: 21 additions & 6 deletions

@@ -55,6 +55,7 @@ pub struct ParquetFilePath(ObjPath);
impl ParquetFilePath {
    /// Generate a parquet file path using the given arguments. This will convert the provided
    /// `chunk_time` into a date time string with format `'YYYY-MM-DD/HH-MM'`
+    #[allow(clippy::too_many_arguments)]
    pub fn new(
        host_prefix: &str,
        db_name: &str,
@@ -63,14 +64,26 @@ impl ParquetFilePath {
        table_id: u32,
        chunk_time: i64,
        wal_file_sequence_number: WalFileSequenceNumber,
+        sub_chunk_index: Option<u64>,
    ) -> Self {
        let date_time = DateTime::<Utc>::from_timestamp_nanos(chunk_time);
-        let path = ObjPath::from(format!(
-            "{host_prefix}/dbs/{db_name}-{db_id}/{table_name}-{table_id}/{date_string}/{wal_seq:010}.{ext}",
-            date_string = date_time.format("%Y-%m-%d/%H-%M"),
-            wal_seq = wal_file_sequence_number.as_u64(),
-            ext = PARQUET_FILE_EXTENSION
-        ));
+        let path = if sub_chunk_index.is_some() {
+            ObjPath::from(format!(
+                "{host_prefix}/dbs/{db_name}-{db_id}/{table_name}-{table_id}/{date_string}/{wal_seq:010}-{chunk_idx}.{ext}",
+                date_string = date_time.format("%Y-%m-%d/%H-%M"),
+                wal_seq = wal_file_sequence_number.as_u64(),
+                chunk_idx = sub_chunk_index.unwrap(),
+                ext = PARQUET_FILE_EXTENSION
+            ))
+
+        } else {
+            ObjPath::from(format!(
+                "{host_prefix}/dbs/{db_name}-{db_id}/{table_name}-{table_id}/{date_string}/{wal_seq:010}.{ext}",
+                date_string = date_time.format("%Y-%m-%d/%H-%M"),
+                wal_seq = wal_file_sequence_number.as_u64(),
+                ext = PARQUET_FILE_EXTENSION
+            ))
+        };
        Self(path)
    }
}
@@ -143,6 +156,7 @@ fn parquet_file_path_new() {
                .timestamp_nanos_opt()
                .unwrap(),
            WalFileSequenceNumber::new(1337),
+            None,
        ),
        ObjPath::from("my_host/dbs/my_db-0/my_table-0/2038-01-19/03-14/0000001337.parquet")
    );
@@ -162,6 +176,7 @@ fn parquet_file_percent_encoded() {
                .timestamp_nanos_opt()
                .unwrap(),
            WalFileSequenceNumber::new(100),
+            None,
        )
        .as_ref()
        .as_ref(),
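
To make the effect of the new `sub_chunk_index` argument concrete, here is a small usage sketch based on the unit test and format strings above. The `Some(0)` case and its expected object-store key are inferred from the new format string, not taken from a test in this commit, and `chunk_time_ns` is a hypothetical placeholder:

// Same arguments as the existing test: with no sub-chunk index the path is unchanged.
let path = ParquetFilePath::new(
    "my_host", "my_db", 0, "my_table", 0,
    chunk_time_ns, // hypothetical timestamp for 2038-01-19T03:14 UTC
    WalFileSequenceNumber::new(1337),
    None,
);
// -> my_host/dbs/my_db-0/my_table-0/2038-01-19/03-14/0000001337.parquet

// Forced-snapshot sub-chunks get the index appended after the zero-padded WAL sequence number.
let sub_chunk_path = ParquetFilePath::new(
    "my_host", "my_db", 0, "my_table", 0,
    chunk_time_ns,
    WalFileSequenceNumber::new(1337),
    Some(0),
);
// -> my_host/dbs/my_db-0/my_table-0/2038-01-19/03-14/0000001337-0.parquet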

influxdb3_write/src/persister.rs

Lines changed: 1 addition & 0 deletions

@@ -968,6 +968,7 @@ mod tests {
            0,
            Utc::now().timestamp_nanos_opt().unwrap(),
            WalFileSequenceNumber::new(1),
+            None,
        );
        let (bytes_written, meta, _) = persister
            .persist_parquet_file(path.clone(), stream_builder.build())

influxdb3_write/src/write_buffer/queryable_buffer.rs

Lines changed: 103 additions & 118 deletions

@@ -1,4 +1,4 @@
-use crate::chunk::BufferChunk;
+use crate::{chunk::BufferChunk, write_buffer::table_buffer::SnaphotChunkIter};
use crate::paths::ParquetFilePath;
use crate::persister::Persister;
use crate::write_buffer::persisted_files::PersistedFiles;
@@ -193,149 +193,134 @@ impl QueryableBuffer {
        for (database_id, table_map) in buffer.db_to_table.iter_mut() {
            let db_schema = catalog.db_schema_by_id(database_id).expect("db exists");
            for (table_id, table_buffer) in table_map.iter_mut() {
+                info!(db_name = ?db_schema.name, ?table_id, ">>> working on db, table");
                let table_def = db_schema
                    .table_definition_by_id(table_id)
                    .expect("table exists");
-                let snapshot_chunks =
-                    table_buffer.snapshot(table_def, snapshot_details.end_time_marker);
+                let sort_key = table_buffer.sort_key.clone();
+                let all_keys_to_remove = table_buffer.get_keys_to_remove(snapshot_details.end_time_marker);
+                info!(num_keys_to_remove = ?all_keys_to_remove.len(), ">>> num keys to remove");
+
+                let chunk_time_to_chunk = &mut table_buffer.chunk_time_to_chunks;
+                let snapshot_chunks = &mut table_buffer.snapshotting_chunks;
+                let snapshot_chunks_iter = SnaphotChunkIter {
+                    keys_to_remove: all_keys_to_remove.iter(),
+                    map: chunk_time_to_chunk,
+                    table_def,
+                };

-                for chunk in snapshot_chunks {
+                for chunk in snapshot_chunks_iter {
+                    debug!(">>> starting with new chunk");
                    let table_name =
                        db_schema.table_id_to_name(table_id).expect("table exists");
-                    // mapping between time to main record batch array's index
-                    let mut smaller_chunks: HashMap<i64, (MinMax, Vec<u64>)> = HashMap::new();
-                    let smaller_duration = Duration::from_secs(10).as_nanos() as i64;
-                    let all_times = chunk
-                        .record_batch
-                        .column_by_name("time")
-                        .expect("time col to be present")
-                        .as_primitive::<TimestampNanosecondType>()
-                        .values();
-                    for (idx, time) in all_times.iter().enumerate() {
-                        let smaller_chunk_time = time - (time % smaller_duration);
-                        let (min_max, vec_indices) = smaller_chunks
-                            .entry(smaller_chunk_time)
-                            .or_insert_with(|| (MinMax::new(i64::MAX, i64::MIN), Vec::new()));
-
-                        min_max.update(*time);
-                        vec_indices.push(idx as u64);
-                    }

-                    let total_row_count = chunk.record_batch.column(0).len();
+                    if snapshot_details.forced {
+                        // when forced, we're already under memory pressure so create smaller
+                        // chunks (by time) and they need to be non-overlapping.
+                        // 1. Create smaller groups (using smaller duration), 10 secs here
+                        let mut smaller_chunks: HashMap<i64, (MinMax, Vec<u64>)> = HashMap::new();
+                        let smaller_duration = Duration::from_secs(10).as_nanos() as i64;
+                        let all_times = chunk
+                            .record_batch
+                            .column_by_name("time")
+                            .expect("time col to be present")
+                            .as_primitive::<TimestampNanosecondType>()
+                            .values();
+
+                        for (idx, time) in all_times.iter().enumerate() {
+                            let smaller_chunk_time = time - (time % smaller_duration);
+                            let (min_max, vec_indices) = smaller_chunks
+                                .entry(smaller_chunk_time)
+                                .or_insert_with(|| (MinMax::new(i64::MAX, i64::MIN), Vec::new()));
+
+                            min_max.update(*time);
+                            vec_indices.push(idx as u64);
+                        }

-                    for (smaller_chunk_time, (min_max, all_indexes)) in smaller_chunks.iter() {
-                        debug!(
-                            ?smaller_chunk_time,
-                            ?min_max,
-                            num_indexes = ?all_indexes.len(),
-                            ?total_row_count,
-                            ">>> number of small chunks");
-                    }
+                        let total_row_count = chunk.record_batch.column(0).len();

-                    // at this point we have a bucket for each 10 sec block, we can create
-                    // smaller record batches here but maybe wasteful if we ever needed one
-                    // batch (let's see how this works first and then decide what can happen)
-                    let batch_schema = chunk.record_batch.schema();
-                    debug!(schema = ?chunk.schema, ">>> influx schema");
-                    debug!(arrow_schema = ?batch_schema, ">>> batch schema");
-                    let parent_cols = chunk.record_batch.columns();
-
-                    for (smaller_chunk_time, (min_max, all_indexes)) in
-                        smaller_chunks.into_iter()
-                    {
-                        let mut smaller_chunk_cols = vec![];
-                        let indices = UInt64Array::from_iter(all_indexes);
-                        for arr in parent_cols {
-                            let filtered =
-                                take(&arr, &indices, None)
-                                    .expect("index should be accessible in parent cols");
-
-                            debug!(smaller_chunk_len = ?filtered.len(), ">>> filtered size");
-                            smaller_chunk_cols.push(filtered);
+                        for (smaller_chunk_time, (min_max, all_indexes)) in smaller_chunks.iter() {
+                            debug!(
+                                ?smaller_chunk_time,
+                                ?min_max,
+                                num_indexes = ?all_indexes.len(),
+                                ?total_row_count,
+                                ">>> number of small chunks");
                        }
-                        debug!(smaller_chunk_len = ?smaller_chunk_cols.len(), ">>> smaller chunks size");
-                        let smaller_rec_batch =
-                            RecordBatch::try_new(Arc::clone(&batch_schema), smaller_chunk_cols)
-                                .expect("create smaller record batch");
+
+                        // 2. At this point we have a bucket for each 10 sec block with related
+                        //    indexes from main chunk. Use those indexes to "cheaply" create
+                        //    smaller record batches.
+                        let batch_schema = chunk.record_batch.schema();
+                        let parent_cols = chunk.record_batch.columns();
+
+                        for (loop_idx, (smaller_chunk_time, (min_max, all_indexes))) in
+                            smaller_chunks.into_iter().enumerate()
+                        {
+                            let mut smaller_chunk_cols = vec![];
+                            let indices = UInt64Array::from_iter(all_indexes);
+                            for arr in parent_cols {
+                                // `take` here minimises allocations but is not completely free,
+                                // it still needs to allocate for smaller batches. The
+                                // allocations are in `ScalarBuffer::from_iter` under the hood
+                                let filtered =
+                                    take(&arr, &indices, None)
+                                        .expect("index should be accessible in parent cols");
+
+                                smaller_chunk_cols.push(filtered);
+                            }
+                            let smaller_rec_batch =
+                                RecordBatch::try_new(Arc::clone(&batch_schema), smaller_chunk_cols)
+                                    .expect("create smaller record batch");
+                            let persist_job = PersistJob {
+                                database_id: *database_id,
+                                table_id: *table_id,
+                                table_name: Arc::clone(&table_name),
+                                chunk_time: smaller_chunk_time,
+                                path: ParquetFilePath::new(
+                                    self.persister.node_identifier_prefix(),
+                                    db_schema.name.as_ref(),
+                                    database_id.as_u32(),
+                                    table_name.as_ref(),
+                                    table_id.as_u32(),
+                                    smaller_chunk_time,
+                                    snapshot_details.last_wal_sequence_number,
+                                    Some(loop_idx as u64),
+                                ),
+                                batch: smaller_rec_batch,
+                                schema: chunk.schema.clone(),
+                                timestamp_min_max: min_max.to_ts_min_max(),
+                                sort_key: sort_key.clone(),
+                            };
+                            persisting_chunks.push(persist_job);
+                        }
+
+                    } else {
                        let persist_job = PersistJob {
                            database_id: *database_id,
                            table_id: *table_id,
                            table_name: Arc::clone(&table_name),
-                            chunk_time: smaller_chunk_time,
+                            chunk_time: chunk.chunk_time,
                            path: ParquetFilePath::new(
                                self.persister.node_identifier_prefix(),
                                db_schema.name.as_ref(),
                                database_id.as_u32(),
                                table_name.as_ref(),
                                table_id.as_u32(),
-                                smaller_chunk_time,
+                                chunk.chunk_time,
                                snapshot_details.last_wal_sequence_number,
+                                None,
                            ),
-                            batch: smaller_rec_batch,
-                            // this schema.clone() can be avoided?
+                            // these clones are cheap and done one at a time
+                            batch: chunk.record_batch.clone(),
                            schema: chunk.schema.clone(),
-                            timestamp_min_max: min_max.to_ts_min_max(),
-                            sort_key: table_buffer.sort_key.clone(),
+                            timestamp_min_max: chunk.timestamp_min_max,
+                            sort_key: sort_key.clone(),
                        };
                        persisting_chunks.push(persist_job);
                    }
-                    // let fields = batch_schema
-                    //     .fields()
-                    //     .iter()
-                    //     .map(|field| SortField::new(field.data_type().clone()))
-                    //     .collect();
-                    // debug!(?fields, ">>> schema fields");
-                    //
-                    // let converter =
-                    //     RowConverter::new(fields).expect("row converter created from fields");
-                    // debug!(?converter, ">>> converter");
-                    //
-                    // let rows = converter
-                    //     .convert_columns(parent_cols)
-                    //     .expect("convert cols to rows to succeed");
-                    // debug!(?rows, ">>> all rows");
-                    //
-                    // for (smaller_chunk_time, (min_max, all_indexes)) in smaller_chunks.iter() {
-                    //
-                    //     // create a record batch using just all_indexes from parent recordbatch
-                    //     let all_rows = all_indexes
-                    //         .iter()
-                    //         .map(|idx| rows.row(*idx))
-                    //         .collect::<Vec<_>>();
-                    //     debug!(?rows, ">>> all filtered child rows");
-                    //
-                    //     // hmmm this conversion turns Dictionary types to StringArray, not sure
-                    //     // why
-                    //     let child_cols = converter
-                    //         .convert_rows(all_rows)
-                    //         .expect("should convert rows back to cols");
-                    //     debug!(?child_cols, ">>> all child cols");
-                    //
-                    //     let smaller_rec_batch =
-                    //         RecordBatch::try_new(Arc::clone(&batch_schema), child_cols)
-                    //             .expect("create smaller record batch");
-                    //     let persist_job = PersistJob {
-                    //         database_id: *database_id,
-                    //         table_id: *table_id,
-                    //         table_name: Arc::clone(&table_name),
-                    //         chunk_time: *smaller_chunk_time,
-                    //         path: ParquetFilePath::new(
-                    //             self.persister.node_identifier_prefix(),
-                    //             db_schema.name.as_ref(),
-                    //             database_id.as_u32(),
-                    //             table_name.as_ref(),
-                    //             table_id.as_u32(),
-                    //             *smaller_chunk_time,
-                    //             snapshot_details.last_wal_sequence_number,
-                    //         ),
-                    //         batch: smaller_rec_batch,
-                    //         // this schema.clone() can be avoided?
-                    //         schema: chunk.schema.clone(),
-                    //         timestamp_min_max: min_max.to_ts_min_max(),
-                    //         sort_key: table_buffer.sort_key.clone(),
-                    //     };
-                    //     persisting_chunks.push(persist_job);
-                    // }
+                    snapshot_chunks.push(chunk);
+                    debug!(">>> finished with chunk");
                }
            }
        }
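
The forced-snapshot branch above boils down to a two-step technique: bucket row indices by 10-second window, then materialise one smaller RecordBatch per bucket with arrow's `take` kernel. Below is a self-contained sketch of that technique outside the InfluxDB types; the function and variable names are illustrative, not part of the codebase:

use std::collections::HashMap;
use std::time::Duration;

use arrow::array::{Array, TimestampNanosecondArray, UInt64Array};
use arrow::compute::take;
use arrow::record_batch::RecordBatch;

/// Split `batch` into one smaller batch per `window`-sized time bucket.
fn split_by_window(batch: &RecordBatch, window: Duration) -> Vec<(i64, RecordBatch)> {
    let window_ns = window.as_nanos() as i64;
    let times = batch
        .column_by_name("time")
        .expect("time column present")
        .as_any()
        .downcast_ref::<TimestampNanosecondArray>()
        .expect("time column is nanosecond timestamps");

    // 1. Bucket row indices by window start, as the `smaller_chunks` map does above.
    let mut buckets: HashMap<i64, Vec<u64>> = HashMap::new();
    for (idx, &t) in times.values().iter().enumerate() {
        buckets.entry(t - (t % window_ns)).or_default().push(idx as u64);
    }

    // 2. Build one smaller RecordBatch per bucket by `take`-ing every parent column.
    buckets
        .into_iter()
        .map(|(window_start, idxs)| {
            let indices = UInt64Array::from(idxs);
            let cols = batch
                .columns()
                .iter()
                .map(|col| take(col.as_ref(), &indices, None).expect("indices are in bounds"))
                .collect();
            let small = RecordBatch::try_new(batch.schema(), cols).expect("schema is unchanged");
            (window_start, small)
        })
        .collect()
}

Grouping the indices first keeps each per-bucket allocation limited to the rows that actually fall in that window, which is why the commit only takes this path under memory pressure (`snapshot_details.forced`).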

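
The `SnaphotChunkIter` imported at the top of queryable_buffer.rs is defined in influxdb3_write/src/write_buffer/table_buffer.rs, presumably the fifth changed file, whose diff is not shown in this view, so its exact definition is not reproduced here. The way it is constructed above (a `keys_to_remove` iterator plus a mutable reference to the chunk map) corresponds to a lazy drain-by-key iterator. A generic, self-contained sketch of that pattern follows; apart from the `keys_to_remove` and `map` field names taken from the struct literal above, every name here is made up:

use std::collections::BTreeMap;

// Illustrative only: `LazyDrain` is not the real SnaphotChunkIter, just the same pattern.
struct LazyDrain<'a, V> {
    keys_to_remove: std::slice::Iter<'a, i64>,
    map: &'a mut BTreeMap<i64, V>,
}

impl<V> Iterator for LazyDrain<'_, V> {
    type Item = (i64, V);

    fn next(&mut self) -> Option<Self::Item> {
        // Nothing is removed from the buffer until the consumer asks for the next item.
        let key = *self.keys_to_remove.next()?;
        self.map.remove(&key).map(|chunk| (key, chunk))
    }
}

fn main() {
    let mut chunk_time_to_chunks = BTreeMap::from([(0, "chunk-a"), (10, "chunk-b"), (20, "chunk-c")]);
    let keys_to_remove: Vec<i64> = vec![0, 10];

    let iter = LazyDrain {
        keys_to_remove: keys_to_remove.iter(),
        map: &mut chunk_time_to_chunks,
    };
    for (chunk_time, chunk) in iter {
        println!("snapshotting chunk_time={chunk_time}: {chunk}");
    }
    // Only the chunk whose key was not scheduled for removal stays buffered.
    assert_eq!(chunk_time_to_chunks.len(), 1);
}

Because each chunk is pulled out of the buffer and handed to the persist path only when the loop asks for the next item, at most one snapshot chunk needs to be materialised at a time, which is what the commit title means by producing snapshot chunks lazily.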