Commit f200db9

Add support for DuckDB underscore numeric literals
- Add numberRegex option to TokenizerOptions to allow custom number patterns
- Update Tokenizer to use the custom number regex when provided
- Configure the DuckDB formatter to support underscore separators in numbers (1_000_000)
- Add a test for underscore numeric literals in DuckDB
1 parent 9c3feb4 commit f200db9
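
For context, a minimal usage sketch of what this commit enables. It assumes sql-formatter's public format entry point with its language: 'duckdb' option; the expected output mirrors the new test added below.

import { format } from 'sql-formatter';

// With this change, underscore digit separators are kept as part of a single
// number token instead of splitting the literal apart.
const result = format('SELECT 1_000_000, 0x1A_2B_3C;', { language: 'duckdb' });
console.log(result);
// Expected (per the test below):
// SELECT
//   1_000_000,
//   0x1A_2B_3C;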

4 files changed, +17 −0 lines changed

src/languages/duckdb/duckdb.formatter.ts

Lines changed: 3 additions & 0 deletions
@@ -155,6 +155,9 @@ export const duckdb: DialectOptions = {
     reservedFunctionNames: functions,
     nestedBlockComments: true,
     extraParens: ['[]', '{}'],
+    // Support underscore separators in numeric literals (e.g., 1_000_000)
+    numberRegex:
+      /(?:0x[0-9a-fA-F_]+|0b[01_]+|(?:-\s*)?(?:[0-9_]*\.[0-9_]+|[0-9_]+(?:\.[0-9_]*)?)(?:[eE][-+]?[0-9_]+(?:\.[0-9_]+)?)?)(?![\w\p{Alphabetic}])/uy,
     stringTypes: [
       '$$',
       "''-qq",

src/lexer/Tokenizer.ts

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ export default class Tokenizer {
     {
       type: TokenType.NUMBER,
       regex:
+        cfg.numberRegex ??
         /(?:0x[0-9a-fA-F]+|0b[01]+|(?:-\s*)?(?:[0-9]*\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][-+]?[0-9]+(?:\.[0-9]+)?)?)(?![\w\p{Alphabetic}])/uy,
     },
     // RESERVED_PHRASE is matched before all other keyword tokens

src/lexer/TokenizerOptions.ts

Lines changed: 2 additions & 0 deletions
@@ -100,6 +100,8 @@ export interface TokenizerOptions {
   propertyAccessOperators?: string[];
   // Enables PostgreSQL-specific OPERATOR(...) syntax
   operatorKeyword?: boolean;
+  // Custom regex pattern for number tokens (defaults to standard SQL number pattern)
+  numberRegex?: RegExp;
   // Allows custom modifications on the token array.
   // Called after the whole input string has been split into tokens.
   // The result of this will be the output of the tokenizer.
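
Because the new option is generic, any dialect could supply its own pattern the same way. The following is a hypothetical sketch, not part of this commit; only the numberRegex field comes from the diff above, everything else is illustrative.

// Hypothetical TokenizerOptions fragment for a dialect that also accepts a
// trailing "L" integer suffix (illustrative only; not in this commit).
const customNumberTokenizerOptions = {
  // ...the dialect's other TokenizerOptions fields...
  numberRegex: /(?:[0-9]+L?|[0-9]*\.[0-9]+)(?![\w\p{Alphabetic}])/uy,
};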

test/duckdb.test.ts

Lines changed: 11 additions & 0 deletions
@@ -214,4 +214,15 @@ describe('DuckDBFormatter', () => {
       1 IS NOT NULL;
     `);
   });
+
+  it('supports underscore separators in numeric literals', () => {
+    expect(format('SELECT 1_000_000, 3.14_159, 0x1A_2B_3C, 0b1010_0001, 1.5e+1_0;')).toBe(dedent`
+      SELECT
+        1_000_000,
+        3.14_159,
+        0x1A_2B_3C,
+        0b1010_0001,
+        1.5e+1_0;
+    `);
+  });
 });
