Skip to content

Commit aba843b

Browse files
authored
Add support for ORC file format (#194)
* Added ORC file support
* Added test for ORC file support
* Updated transformORC to use map for row
* Updated test for transformORCFile
* Changed m to be map slice rather than interface slice
* Finished adding support for ORC and added option in UI
* Fixed bug where ORC, Parquet, and ODS options were not displaying on desktop
* Ran formatter
1 parent 6c773f1 commit aba843b

File tree

7 files changed

+141
-8
lines changed

7 files changed

+141
-8
lines changed

runner/file.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@ import (
1111
"runtime"
1212
"strings"
1313

14-
"github.com/multiprocessio/go-json"
14+
jsonutil "github.com/multiprocessio/go-json"
1515
"github.com/multiprocessio/go-openoffice"
16+
"github.com/scritchley/orc"
1617

1718
"github.com/xitongsys/parquet-go-source/local"
1819
"github.com/xitongsys/parquet-go/reader"
@@ -142,6 +143,42 @@ func transformParquetFile(in string, out io.Writer) error {
142143
return transformParquet(r, out)
143144
}
144145

146+
func transformORC(in *orc.Reader, out io.Writer) error {
147+
cols := in.Schema().Columns()
148+
c := in.Select(cols...)
149+
150+
return withJSONArrayOutWriterFile(out, func(w *jsonutil.StreamEncoder) error {
151+
row := map[string]interface{}{}
152+
153+
for c.Stripes() {
154+
for c.Next() {
155+
r := c.Row()
156+
for i, col := range cols {
157+
row[col] = r[i]
158+
}
159+
160+
err := w.EncodeRow(row)
161+
if err != nil {
162+
return err
163+
}
164+
}
165+
}
166+
167+
return c.Err()
168+
169+
})
170+
}
171+
172+
func transformORCFile(in string, out io.Writer) error {
173+
r, err := orc.Open(in)
174+
if err != nil {
175+
return err
176+
}
177+
defer r.Close()
178+
179+
return transformORC(r, out)
180+
}
181+
145182
func writeSheet(rows [][]string, w *jsonutil.StreamEncoder) error {
146183
var header []string
147184
isHeader := true
@@ -481,6 +518,7 @@ const (
481518
ExcelOpenXMLMimeType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
482519
OpenOfficeSheetMimeType = "application/vnd.oasis.opendocument.spreadsheet"
483520
ParquetMimeType = "parquet"
521+
ORCMimeType = "orc"
484522
ApacheErrorMimeType = "text/apache2error"
485523
ApacheAccessMimeType = "text/apache2access"
486524
NginxAccessMimeType = "text/nginxaccess"
@@ -511,6 +549,8 @@ func GetMimeType(fileName string, ct ContentTypeInfo) MimeType {
511549
return OpenOfficeSheetMimeType
512550
case ".parquet":
513551
return ParquetMimeType
552+
case ".orc":
553+
return ORCMimeType
514554
}
515555

516556
return UnknownMimeType
@@ -552,6 +592,8 @@ func TransformFile(fileName string, cti ContentTypeInfo, out io.Writer) error {
552592
return transformXLSXFile(fileName, out)
553593
case ParquetMimeType:
554594
return transformParquetFile(fileName, out)
595+
case ORCMimeType:
596+
return transformORCFile(fileName, out)
555597
case JSONConcatMimeType:
556598
return transformJSONConcatFile(fileName, out)
557599
case RegexpLinesMimeType:

runner/file_test.go

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@ import (
44
"encoding/json"
55
"fmt"
66
"io/ioutil"
7+
"math/rand"
78
"os"
89
"testing"
910
"time"
1011

12+
"github.com/scritchley/orc"
1113
"github.com/stretchr/testify/assert"
1214
)
1315

@@ -122,6 +124,73 @@ func Test_transformJSONConcat(t *testing.T) {
122124
}
123125
}
124126

127+
func Test_transformORCFile(t *testing.T) {
128+
inTmp, err := ioutil.TempFile("", "")
129+
assert.Nil(t, err)
130+
defer os.Remove(inTmp.Name())
131+
defer inTmp.Close()
132+
133+
// define column types for ORC file
134+
schema, err := orc.ParseSchema("struct<username:string,administrator:boolean,score:double,nested:struct<randomnumber:double,correct:boolean>>")
135+
assert.Nil(t, err)
136+
137+
w, err := orc.NewWriter(inTmp, orc.SetSchema(schema))
138+
assert.Nil(t, err)
139+
140+
length := 2 // number of rows to create
141+
142+
// will hold output data for test
143+
var expJson []map[string]interface{}
144+
145+
// generate test data
146+
for i := 0; i < length; i++ {
147+
nestedValues := []interface{}{
148+
rand.Float64(),
149+
rand.Int63n(10000) > 5000,
150+
}
151+
152+
values := []interface{}{
153+
fmt.Sprintf("%x", rand.Int63n(1000)),
154+
rand.Int63n(10000) > 4444,
155+
rand.Float64(),
156+
nestedValues,
157+
}
158+
159+
expJson = append(expJson, map[string]interface{}{
160+
"username": values[0],
161+
"administrator": values[1],
162+
"score": values[2],
163+
"nested": map[string]interface{}{
164+
"randomnumber": nestedValues[0],
165+
"correct": nestedValues[1],
166+
},
167+
})
168+
169+
err = w.Write(values...)
170+
assert.Nil(t, err)
171+
}
172+
173+
err = w.Close()
174+
assert.Nil(t, err)
175+
176+
outTmp, err := ioutil.TempFile("", "")
177+
defer os.Remove(outTmp.Name())
178+
assert.Nil(t, err)
179+
180+
err = transformORCFile(inTmp.Name(), outTmp)
181+
assert.Nil(t, err)
182+
183+
var m []map[string]interface{}
184+
outTmpBs, err := ioutil.ReadFile(outTmp.Name())
185+
assert.Nil(t, err)
186+
187+
err = json.Unmarshal(outTmpBs, &m)
188+
assert.Nil(t, err)
189+
190+
assert.Equal(t, expJson, m)
191+
192+
}
193+
125194
func Test_transformGeneric(t *testing.T) {
126195
tests := []struct {
127196
in string

runner/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ require (
8989
github.com/prometheus/procfs v0.7.3 // indirect
9090
github.com/richardlehane/mscfb v1.0.3 // indirect
9191
github.com/richardlehane/msoleps v1.0.1 // indirect
92+
github.com/scritchley/orc v0.0.0-20210513144143-06dddf1ad665 // indirect
9293
github.com/shopspring/decimal v1.3.1 // indirect
9394
github.com/sirupsen/logrus v1.8.1 // indirect
9495
github.com/xuri/efp v0.0.0-20210322160811-ab561f5b45e3 // indirect

runner/go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,8 @@ github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTK
490490
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
491491
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
492492
github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w=
493+
github.com/scritchley/orc v0.0.0-20210513144143-06dddf1ad665 h1:W7Y6ejGhTaW9WlWhTtxE8f+SOa3c1NoFWsU9XT2cUOY=
494+
github.com/scritchley/orc v0.0.0-20210513144143-06dddf1ad665/go.mod h1:U4h1RViHcbDQl9stSaImdd7N3/ZnUkZ2yombj5cSgEY=
493495
github.com/shirou/gopsutil v2.19.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
494496
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc=
495497
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=

runner/http.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,22 @@ func TransformReader(r io.Reader, fileName string, cti ContentTypeInfo, out io.W
260260
}
261261

262262
return transformParquetFile(w.Name(), out)
263+
case ORCMimeType:
264+
w, err := ioutil.TempFile("", "http-orc-temp")
265+
if err != nil {
266+
return err
267+
}
268+
defer os.Remove(w.Name())
269+
270+
_, err = w.ReadFrom(r)
271+
if err == io.EOF {
272+
err = nil
273+
}
274+
if err != nil {
275+
return err
276+
}
277+
278+
return transformORCFile(w.Name(), out)
263279
case RegexpLinesMimeType:
264280
// There are probably weird cases this won't work but
265281
// let's wait for a bug report to do more intelligent

ui/components/ContentTypePicker.tsx

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,15 @@ export function ContentTypePicker({
3838
<option value="text/csv">CSV</option>
3939
<option value="text/tab-separated-values">TSV</option>
4040
<option value={XLSX_MIME_TYPE}>Excel</option>
41-
{!inMemoryEval /* This is getting ridiculous. Really need to find a plugin architecture */ && (
42-
<React.Fragment>
43-
<option value={ODS_MIME_TYPE}>ODS</option>
44-
<option value="parquet">Parquet</option>
45-
</React.Fragment>
46-
)}
41+
<option disabled={inMemoryEval} value={ODS_MIME_TYPE}>
42+
ODS
43+
</option>
44+
<option disabled={inMemoryEval} value="parquet">
45+
Parquet
46+
</option>
47+
<option disabled={inMemoryEval} value="orc">
48+
ORC
49+
</option>
4750
<option value="application/json">JSON</option>
4851
<option value="application/jsonlines">
4952
Newline-delimited JSON

ui/panels/FilePanel.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ export function FilePanelDetails({
6565
/>
6666
</div>
6767
<ContentTypePicker
68-
inMemoryEval={MODE !== 'browser'}
68+
inMemoryEval={MODE === 'browser'}
6969
value={panel.file.contentTypeInfo}
7070
onChange={(cti: { type: string; customLineRegexp: string }) => {
7171
panel.file.contentTypeInfo = cti;

0 commit comments

Comments
 (0)