Draft
40 commits
1e356d3
Improve uuid support in the vectorized pipeline
akuzm Sep 3, 2025
8b018ad
fix
akuzm Sep 3, 2025
2571f3d
fix the groupagg test
akuzm Sep 3, 2025
2b99ab1
ref
akuzm Sep 3, 2025
5d64912
ignore the test
akuzm Sep 4, 2025
224616e
ignore not skip
akuzm Sep 4, 2025
fab4dce
some experiments
akuzm Sep 4, 2025
929fbaf
opexpr
akuzm Sep 5, 2025
48112a7
memory context
akuzm Sep 5, 2025
5f4d32b
things
akuzm Sep 8, 2025
fd4f81d
stuff
akuzm Sep 10, 2025
ebc4ef3
benchmark vectorized expressions (2025-09-10 no. 2)
akuzm Sep 10, 2025
4cb0b89
refactor more
akuzm Sep 15, 2025
4eb49db
some progress
akuzm Sep 16, 2025
b5f28b2
byref
akuzm Sep 16, 2025
9572e43
benchmark vectorized expressions (2025-09-16 no. 3)
akuzm Sep 16, 2025
0907c12
typo
akuzm Sep 16, 2025
02a7cad
benchmark vectorized expressions (2025-09-16 no. 4)
akuzm Sep 16, 2025
37d5ea7
experiments
akuzm Sep 17, 2025
38c51fe
try to support text
akuzm Sep 17, 2025
7366184
benchmark vectorized expressions (2025-09-17 no. 5)
akuzm Sep 17, 2025
dd8180e
cleanups
akuzm Sep 18, 2025
3e5c0cc
Merge remote-tracking branch 'origin/main' into HEAD
akuzm Sep 18, 2025
bae656d
null results
akuzm Sep 18, 2025
12b74bc
more tests
akuzm Sep 19, 2025
48e475d
Make the vector_agg_groupagg test collation-independent
akuzm Sep 19, 2025
f472b39
refs
akuzm Sep 19, 2025
f38c818
Merge remote-tracking branch 'akuzm/collation-dash' into HEAD
akuzm Sep 19, 2025
b3ea9e3
Merge remote-tracking branch 'origin/main' into HEAD
akuzm Sep 24, 2025
f622ecf
some tests
akuzm Sep 24, 2025
a37d970
fix
akuzm Sep 24, 2025
7e72054
ref
akuzm Sep 25, 2025
88582ae
bool
akuzm Sep 26, 2025
7c8845e
Merge remote-tracking branch 'origin/main' into HEAD
akuzm Sep 26, 2025
d4e2583
benchmark vectorized expressions (2025-09-26 no. 1)
akuzm Sep 26, 2025
696568a
Merge remote-tracking branch 'origin/main' into HEAD
akuzm Oct 6, 2025
462bb3d
cleanup
akuzm Oct 6, 2025
35970e3
Merge remote-tracking branch 'origin/main' into HEAD
akuzm Oct 8, 2025
78363d7
Merge remote-tracking branch 'origin/main' into HEAD
akuzm Oct 8, 2025
af793f2
fix after merge
akuzm Oct 8, 2025
29 changes: 27 additions & 2 deletions tsl/src/compression/arrow_c_data_interface.h
@@ -184,12 +184,18 @@ arrow_set_row_validity(uint64 *bitmap, size_t row_number, bool value)
 }
 
 /*
- * Combine the validity bitmaps into the given storage.
+ * Combine the validity bitmaps into the given storage. Can return one of the
+ * input filters if the others are NULL.
  */
-static inline const uint64 *
+static inline pg_nodiscard const uint64 *
 arrow_combine_validity(size_t num_words, uint64 *restrict storage, const uint64 *filter1,
 					   const uint64 *filter2, const uint64 *filter3)
 {
+	Assert(num_words != 0);
+	Assert(storage != filter1);
+	Assert(storage != filter2);
+	Assert(storage != filter3);
+
 	/*
 	 * Any and all of the filters can be null. For simplicity, move the non-null
 	 * filters to the leading positions.
@@ -256,6 +262,23 @@ arrow_combine_validity(size_t num_words, uint64 *restrict storage, const uint64
 	return storage;
 }
 
+/*
+ * Do the &= operation on bitmaps. The right argument can be NULL.
+ */
+static inline void
+arrow_validity_and(int num_words, uint64 *restrict left, const uint64 *right)
+{
+	if (right == NULL)
+	{
+		return;
+	}
+
+	for (int i = 0; i < num_words; i++)
+	{
+		left[i] &= right[i];
+	}
+}
+
 /*
  * Increase the `source_value` to be an even multiple of `pad_to`.
  */
@@ -268,6 +291,8 @@ pad_to_multiple(uint64 pad_to, uint64 source_value)
 static inline int
 arrow_num_valid(const uint64 *bitmap, size_t total_rows)
 {
+	Assert(total_rows != 0);
+
 	if (bitmap == NULL)
 	{
 		return total_rows;
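A standalone sketch of the bitmap convention these helpers use (bit i set in a validity word means row i passes, and a NULL filter means "all rows pass"); it mirrors the semantics of arrow_validity_and() with standard types, and the sample values are made up:

```c
#include <stdint.h>
#include <stdio.h>

static void
validity_and(int num_words, uint64_t *restrict left, const uint64_t *right)
{
	if (right == NULL)
		return; /* a NULL filter means "all rows pass" */

	for (int i = 0; i < num_words; i++)
		left[i] &= right[i];
}

int
main(void)
{
	uint64_t column_validity[1] = { 0xFF }; /* rows 0..7 are non-null */
	uint64_t qual_result[1] = { 0xAA };     /* quals pass on odd rows only */

	validity_and(1, column_validity, qual_result);

	/* Prints 0xaa: a row survives only if it is non-null AND passes the quals. */
	printf("0x%llx\n", (unsigned long long) column_validity[0]);
	return 0;
}
```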
97 changes: 4 additions & 93 deletions tsl/src/nodes/decompress_chunk/compressed_batch.c
@@ -143,7 +143,7 @@ make_single_value_arrow(Oid pgtype, Datum datum, bool isnull)
 	return make_single_value_arrow_arithmetic(pgtype, datum, isnull);
 }
 
-static int
+int
 get_max_text_datum_size(ArrowArray *text_array)
 {
 	int maxbytes = 0;
@@ -1079,21 +1079,6 @@ compressed_batch_set_compressed_tuple(DecompressContext *dcontext,
 	}
 }
 
-static void
-store_text_datum(CompressedColumnValues *column_values, int arrow_row)
-{
-	const uint32 start = ((uint32 *) column_values->buffers[1])[arrow_row];
-	const int32 value_bytes = ((uint32 *) column_values->buffers[1])[arrow_row + 1] - start;
-	Assert(value_bytes >= 0);
-
-	const int total_bytes = value_bytes + VARHDRSZ;
-	Assert(DatumGetPointer(*column_values->output_value) != NULL);
-	SET_VARSIZE(*column_values->output_value, total_bytes);
-	memcpy(VARDATA(*column_values->output_value),
-		   &((uint8 *) column_values->buffers[2])[start],
-		   value_bytes);
-}
-
 /*
  * Construct the next tuple in the decompressed scan slot.
  * Doesn't check the quals.
@@ -1106,83 +1091,9 @@ make_next_tuple(DecompressBatchState *batch_state, uint16 arrow_row, int num_dat
 	Assert(batch_state->total_batch_rows > 0);
 	Assert(batch_state->next_batch_row < batch_state->total_batch_rows);
 
-	for (int i = 0; i < num_data_columns; i++)
-	{
-		CompressedColumnValues *column_values = &batch_state->compressed_columns[i];
-		if (column_values->decompression_type == DT_Iterator)
-		{
-			DecompressionIterator *iterator = (DecompressionIterator *) column_values->buffers[0];
-			DecompressResult result = iterator->try_next(iterator);
-
-			if (result.is_done)
-			{
-				elog(ERROR, "compressed column out of sync with batch counter");
-			}
-
-			*column_values->output_isnull = result.is_null;
-			*column_values->output_value = result.val;
-		}
-		else if (column_values->decompression_type > SIZEOF_DATUM)
-		{
-			/*
-			 * Fixed-width by-reference type that doesn't fit into a Datum.
-			 * For now this only happens for 8-byte types on 32-bit systems,
-			 * but eventually we could also use it for bigger by-value types
-			 * such as UUID.
-			 */
-			const uint8 value_bytes = column_values->decompression_type;
-			const char *src = column_values->buffers[1];
-			*column_values->output_value = PointerGetDatum(&src[value_bytes * arrow_row]);
-			*column_values->output_isnull =
-				!arrow_row_is_valid(column_values->buffers[0], arrow_row);
-		}
-		else if (column_values->decompression_type == DT_ArrowBits)
-		{
-			/*
-			 * The DT_ArrowBits type is a special case, because the value is
-			 * stored as an Array of bits.
-			 */
-			*column_values->output_value =
-				BoolGetDatum(arrow_row_is_valid(column_values->buffers[1], arrow_row));
-			*column_values->output_isnull =
-				!arrow_row_is_valid(column_values->buffers[0], arrow_row);
-		}
-		else if (column_values->decompression_type > 0)
-		{
-			/*
-			 * Fixed-width by-value type that fits into a Datum.
-			 *
-			 * The conversion of Datum to more narrow types will truncate
-			 * the higher bytes, so we don't care if we read some garbage
-			 * into them, and can always read 8 bytes. These are unaligned
-			 * reads, so technically we have to do memcpy.
-			 */
-			const uint8 value_bytes = column_values->decompression_type;
-			Assert(value_bytes <= SIZEOF_DATUM);
-			const char *src = column_values->buffers[1];
-			memcpy(column_values->output_value, &src[value_bytes * arrow_row], SIZEOF_DATUM);
-			*column_values->output_isnull =
-				!arrow_row_is_valid(column_values->buffers[0], arrow_row);
-		}
-		else if (column_values->decompression_type == DT_ArrowText)
-		{
-			store_text_datum(column_values, arrow_row);
-			*column_values->output_isnull =
-				!arrow_row_is_valid(column_values->buffers[0], arrow_row);
-		}
-		else if (column_values->decompression_type == DT_ArrowTextDict)
-		{
-			const int16 index = ((int16 *) column_values->buffers[3])[arrow_row];
-			store_text_datum(column_values, index);
-			*column_values->output_isnull =
-				!arrow_row_is_valid(column_values->buffers[0], arrow_row);
-		}
-		else
-		{
-			/* A compressed column with default value, do nothing. */
-			Assert(column_values->decompression_type == DT_Scalar);
-		}
-	}
+	compressed_columns_to_postgres_data(batch_state->compressed_columns,
+										num_data_columns,
+										arrow_row);
 
 	/*
 	 * It's a virtual tuple slot, so no point in clearing/storing it
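A standalone demonstration of the "read a full 8 bytes and let narrowing truncate the garbage" trick described in the comment above; it assumes a little-endian host, and the packed sample values are made up:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* Tightly packed 2-byte values, like an Arrow buffer for a 2-byte type:
	 * row 0 = 12345 (0x39 0x30), row 1 = -1234 (0x2E 0xFB), plus padding so
	 * that a full 8-byte read of row 1 stays in bounds. */
	const uint8_t buf[] = { 0x39, 0x30, 0x2E, 0xFB, 0, 0, 0, 0, 0, 0 };

	/* The read may be unaligned, so memcpy instead of a pointer cast. */
	uint64_t datum;
	memcpy(&datum, &buf[2 * 1], sizeof(datum)); /* row 1 plus 6 garbage bytes */

	/* Narrowing truncates the high bytes, leaving only the real value. */
	printf("%d\n", (int) (int16_t) datum); /* prints -1234 */
	return 0;
}
```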
116 changes: 116 additions & 0 deletions tsl/src/nodes/decompress_chunk/compressed_batch.h
@@ -197,3 +197,119 @@ typedef struct CompressedBatchVectorQualState
 
 const ArrowArray *compressed_batch_get_arrow_array(VectorQualState *vqstate, Expr *expr,
 												   bool *is_default_value);
+int get_max_text_datum_size(ArrowArray *text_array);
+
+inline static void
+store_text_datum(CompressedColumnValues *column_values, int arrow_row)
+{
+	const uint32 start = ((uint32 *) column_values->buffers[1])[arrow_row];
+	const int32 value_bytes = ((uint32 *) column_values->buffers[1])[arrow_row + 1] - start;
+	Assert(value_bytes >= 0);
+
+	const int total_bytes = value_bytes + VARHDRSZ;
+	Assert(DatumGetPointer(*column_values->output_value) != NULL);
+	SET_VARSIZE(*column_values->output_value, total_bytes);
+	memcpy(VARDATA(*column_values->output_value),
+		   &((uint8 *) column_values->buffers[2])[start],
+		   value_bytes);
+}
+
+inline static void
+compressed_columns_to_postgres_data(CompressedColumnValues *columns, int num_data_columns,
+									uint16 arrow_row)
+{
+	for (int i = 0; i < num_data_columns; i++)
+	{
+		CompressedColumnValues *column_values = &columns[i];
+		switch ((int) column_values->decompression_type)
+		{
+			case DT_Iterator:
+			{
+				DecompressionIterator *iterator =
+					(DecompressionIterator *) column_values->buffers[0];
+				DecompressResult result = iterator->try_next(iterator);
+
+				if (result.is_done)
+				{
+					elog(ERROR, "compressed column out of sync with batch counter");
+				}
+
+				*column_values->output_isnull = result.is_null;
+				*column_values->output_value = result.val;
+				break;
+			}
+#ifndef USE_FLOAT8_BYVAL
+			case 8:
+#endif
+			case 16:
+			{
+				/*
+				 * Fixed-width by-reference type that doesn't fit into a Datum.
+				 * For now this only happens for 8-byte types on 32-bit systems,
+				 * but eventually we could also use it for bigger by-value types
+				 * such as UUID.
+				 */
+				const uint8 value_bytes = column_values->decompression_type;
+				const char *src = column_values->buffers[1];
+				*column_values->output_value = PointerGetDatum(&src[value_bytes * arrow_row]);
+				*column_values->output_isnull =
+					!arrow_row_is_valid(column_values->buffers[0], arrow_row);
+				break;
+			}
+			case DT_ArrowBits:
+			{
+				/*
+				 * The DT_ArrowBits type is a special case, because the value is
+				 * stored as an Array of bits.
+				 */
+				*column_values->output_value =
+					BoolGetDatum(arrow_row_is_valid(column_values->buffers[1], arrow_row));
+				*column_values->output_isnull =
+					!arrow_row_is_valid(column_values->buffers[0], arrow_row);
+				break;
+			}
+			case 2:
+			case 4:
+#ifdef USE_FLOAT8_BYVAL
+			case 8:
+#endif
+			{
+				/*
+				 * Fixed-width by-value type that fits into a Datum.
+				 *
+				 * The conversion of Datum to more narrow types will truncate
+				 * the higher bytes, so we don't care if we read some garbage
+				 * into them, and can always read 8 bytes. These are unaligned
+				 * reads, so technically we have to do memcpy.
+				 */
+				const uint8 value_bytes = column_values->decompression_type;
+				Assert(value_bytes <= SIZEOF_DATUM);
+				const char *src = column_values->buffers[1];
+				memcpy(column_values->output_value, &src[value_bytes * arrow_row], SIZEOF_DATUM);
+				*column_values->output_isnull =
+					!arrow_row_is_valid(column_values->buffers[0], arrow_row);
+				break;
+			}
+			case DT_ArrowText:
+			{
+				store_text_datum(column_values, arrow_row);
+				*column_values->output_isnull =
+					!arrow_row_is_valid(column_values->buffers[0], arrow_row);
+				break;
+			}
+			case DT_ArrowTextDict:
+			{
+				const int16 index = ((int16 *) column_values->buffers[3])[arrow_row];
+				store_text_datum(column_values, index);
+				*column_values->output_isnull =
+					!arrow_row_is_valid(column_values->buffers[0], arrow_row);
+				break;
+			}
+			default:
+			{
+				/* A compressed column with default value, do nothing. */
+				Assert(column_values->decompression_type == DT_Scalar);
+			}
+		}
+	}
+}
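A standalone illustration of the Arrow variable-length text layout that store_text_datum() reads: buffers[1] holds n + 1 byte offsets, buffers[2] holds the concatenated string bytes, and row i spans [offsets[i], offsets[i + 1]). The sample column below is made up:

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Made-up column holding the three strings "foo", "bar", "baz". */
	const char data[] = "foobarbaz";           /* buffers[2]: concatenated bytes */
	const uint32_t offsets[] = { 0, 3, 6, 9 }; /* buffers[1]: n + 1 offsets */

	for (int row = 0; row < 3; row++)
	{
		const uint32_t start = offsets[row];
		const uint32_t value_bytes = offsets[row + 1] - start;

		/* store_text_datum() would copy these bytes into a varlena after
		 * SET_VARSIZE(); here we just print them. */
		printf("row %d: %.*s\n", row, (int) value_bytes, data + start);
	}
	return 0;
}
```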
28 changes: 28 additions & 0 deletions tsl/src/nodes/decompress_chunk/vector_predicates.c
@@ -303,3 +303,31 @@ vector_booleantest(const ArrowArray *arrow, int test_type, uint64 *restrict resu
 			break;
 	}
 }
+
+static void
+vector_int8pl(const ArrowArray **args, int nargs, ArrowArray *result)
+{
+	Ensure(nargs == 2, "wrong number of arguments %d given to function %s", nargs, __FUNCTION__);
+
+	const int n = args[0]->length;
+	Ensure(args[1]->length == n, "argument length mismatch");
+	Ensure(result->length == n, "result length mismatch");
+
+	int64 *restrict values = (int64 *) result->buffers[1];
+	for (int i = 0; i < n; i++)
+	{
+		values[i] =
+			((const int64 *) args[0]->buffers[1])[i] + ((const int64 *) args[1]->buffers[1])[i];
+	}
+}
+
+VectorFunction *
+get_vector_function(Oid pg_function)
+{
+	switch (pg_function)
+	{
+		case F_INT8PL:
+			return vector_int8pl;
+	}
+	return NULL;
+}
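A self-contained sketch of the OID-to-function dispatch introduced here, where NULL means "no vectorized implementation, fall back to row-by-row execution". The MiniArrow struct and the hard-coded OID value are illustrative stand-ins for ArrowArray and fmgroids.h, not the real definitions:

```c
#include <stdint.h>
#include <stdio.h>

/* Stand-in for ArrowArray: length plus value buffer (validity omitted). */
typedef struct MiniArrow
{
	int64_t length;
	void *buffers[2];
} MiniArrow;

typedef void(VectorFunction)(const MiniArrow **args, int nargs, MiniArrow *result);

/* Stand-in value; the real code takes F_INT8PL from fmgroids.h. */
#define F_INT8PL 463

static void
vector_int8pl(const MiniArrow **args, int nargs, MiniArrow *result)
{
	int64_t *out = (int64_t *) result->buffers[1];
	for (int64_t i = 0; i < result->length; i++)
	{
		out[i] = ((const int64_t *) args[0]->buffers[1])[i] +
				 ((const int64_t *) args[1]->buffers[1])[i];
	}
}

static VectorFunction *
get_vector_function(unsigned pg_function)
{
	switch (pg_function)
	{
		case F_INT8PL:
			return vector_int8pl;
	}
	return NULL; /* caller falls back to the non-vectorized path */
}

int
main(void)
{
	int64_t a[] = { 1, 2, 3 }, b[] = { 10, 20, 30 }, out[3];
	MiniArrow arg0 = { 3, { NULL, a } };
	MiniArrow arg1 = { 3, { NULL, b } };
	MiniArrow result = { 3, { NULL, out } };
	const MiniArrow *args[] = { &arg0, &arg1 };

	VectorFunction *fn = get_vector_function(F_INT8PL);
	if (fn != NULL)
	{
		fn(args, 2, &result);
	}

	/* Prints "11 22 33": elementwise int64 addition over the value buffers. */
	printf("%lld %lld %lld\n", (long long) out[0], (long long) out[1], (long long) out[2]);
	return 0;
}
```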
4 changes: 4 additions & 0 deletions tsl/src/nodes/decompress_chunk/vector_predicates.h
@@ -13,6 +13,10 @@ typedef void(VectorPredicate)(const ArrowArray *, Datum, uint64 *restrict);
 
 VectorPredicate *get_vector_const_predicate(Oid pg_predicate);
 
+typedef void(VectorFunction)(const ArrowArray **args, int nargs, ArrowArray *result);
+
+VectorFunction *get_vector_function(Oid pg_function);
+
 void vector_array_predicate(VectorPredicate *vector_const_predicate, bool is_or,
 							const ArrowArray *vector, Datum array, uint64 *restrict final_result);
 