Skip to content

Commit f508b17

Browse files
authored
Bump JSONL line length limit to 1MB (from 64k) (#209)
Unless a maximum buffer size is set explicitly, Go's `bufio.Scanner` limits each scanned token (here, each line) to 64k. See https://pkg.go.dev/bufio#pkg-constants and https://pkg.go.dev/bufio#Scanner.Buffer
1 parent 3e02bb2 commit f508b17

File tree

2 files changed

+60
-27
lines changed

2 files changed

+60
-27
lines changed

runner/file.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -437,10 +437,19 @@ func transformJSONConcatFile(in string, out io.Writer) error {
437437
return transformJSONConcat(r, out)
438438
}
439439

440+
// newLargeLineScanner wraps in with a bufio.Scanner whose maximum buffer
// size is raised to 1024*1024 bytes, so that lines longer than the bufio
// default of 64k can be scanned.
func newLargeLineScanner(in io.Reader) *bufio.Scanner {
441+
scanner := bufio.NewScanner(in)
442+
443+
// Bump line length limit to 1MB (from 64k). The 4096-byte slice is only
// the initial buffer; 1024*1024 is the maximum size the scanner may
// allocate during scanning.
444+
buf := make([]byte, 4096)
445+
scanner.Buffer(buf, 1024*1024)
446+
return scanner
447+
}
448+
440449
func transformJSONLines(in io.Reader, out io.Writer) error {
441450
first := true
442451
return withJSONOutWriter(out, "[", "]", func() error {
443-
scanner := bufio.NewScanner(in)
452+
scanner := newLargeLineScanner(in)
444453
for scanner.Scan() {
445454
if !first {
446455
_, err := out.Write([]byte(",\n"))
@@ -477,7 +486,7 @@ var BUILTIN_REGEX = map[MimeType]*regexp.Regexp{
477486
}
478487

479488
func transformRegexp(in io.Reader, out io.Writer, re *regexp.Regexp) error {
480-
scanner := bufio.NewScanner(in)
489+
scanner := newLargeLineScanner(in)
481490
return withJSONArrayOutWriterFile(out, func(w *jsonutil.StreamEncoder) error {
482491
row := map[string]any{}
483492
for scanner.Scan() {

runner/file_test.go

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"io/ioutil"
77
"math/rand"
88
"os"
9+
"strings"
910
"testing"
1011
"time"
1112

@@ -14,36 +15,59 @@ import (
1415
)
1516

1617
func Test_transformJSONLines(t *testing.T) {
17-
tmp, err := ioutil.TempFile("", "")
18-
defer os.Remove(tmp.Name())
19-
assert.Nil(t, err)
18+
longString := strings.Repeat("Omnis ut ut voluptatem provident eaque necessitatibus quia. Eos veniam qui. ", 1024) // 76kb
19+
tests := []struct {
20+
input string
21+
output []map[string]any
22+
}{
23+
{
24+
`{"a": 1, "b": 2}
25+
{"a": 2, "b": 3}`,
26+
[]map[string]any{
27+
{
28+
"a": float64(1),
29+
"b": float64(2),
30+
},
31+
{
32+
"a": float64(2),
33+
"b": float64(3),
34+
},
35+
},
36+
},
37+
{
38+
`{"a": 1, "b": "` + longString + `"}`,
39+
[]map[string]any{
40+
{
41+
"a": float64(1),
42+
"b": longString,
43+
},
44+
},
45+
},
46+
}
2047

21-
tmp.WriteString(`{"a": 1, "b": 2}
22-
{"a": 2, "b": 3}`)
48+
for _, test := range tests {
49+
tmp, err := ioutil.TempFile("", "")
50+
assert.Nil(t, err)
2351

24-
tmp2, err := ioutil.TempFile("", "")
25-
defer os.Remove(tmp2.Name())
26-
assert.Nil(t, err)
52+
tmp.WriteString(test.input)
2753

28-
err = transformJSONLinesFile(tmp.Name(), tmp2)
29-
assert.Nil(t, err)
54+
tmp2, err := ioutil.TempFile("", "")
55+
assert.Nil(t, err)
3056

31-
var m []map[string]any
32-
tmp2Bs, err := ioutil.ReadFile(tmp2.Name())
33-
assert.Nil(t, err)
34-
err = json.Unmarshal(tmp2Bs, &m)
35-
assert.Nil(t, err)
57+
err = transformJSONLinesFile(tmp.Name(), tmp2)
58+
assert.Nil(t, err)
3659

37-
assert.Equal(t, []map[string]any{
38-
{
39-
"a": float64(1),
40-
"b": float64(2),
41-
},
42-
{
43-
"a": float64(2),
44-
"b": float64(3),
45-
},
46-
}, m)
60+
var m []map[string]any
61+
tmp2Bs, err := ioutil.ReadFile(tmp2.Name())
62+
assert.Nil(t, err)
63+
err = json.Unmarshal(tmp2Bs, &m)
64+
assert.Nil(t, err)
65+
66+
assert.Equal(t, test.output, m)
67+
68+
os.Remove(tmp.Name())
69+
os.Remove(tmp2.Name())
70+
}
4771
}
4872

4973
func Test_parquet(t *testing.T) {

0 commit comments

Comments
 (0)