Skip to content

Commit fb492e2

Browse files
authored
copy languages package from Sourcegraph to Zoekt (#979)
We want Zoekt and Sourcegraph to use the same languages package. In this PR, we move the languages package from Sourcegraph to Zoekt so that Zoekt can use it and Sourcegraph can import it. Notes: - Zoekt doesn't need to fetch content asynchronously, which is why I added a small helper func `GetLanguagesFromContent` to make the call sites in Zoekt less awkward. - Sourcegraph's languages package always classified .cls files as Apex, while Zoekt did a content-based check. With this PR, we follow Zoekt's approach. Specifically, I removed .cls from `unsupportedByEnryExtensionToNameMap`. I added an additional unit test to cover this case. Test plan: I appended the test cases from the old Zoekt languages package to the tests I copied over from Sourcegraph.
1 parent 69b7ba7 commit fb492e2

File tree

11 files changed

+1063
-193
lines changed

11 files changed

+1063
-193
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ require (
5454
golang.org/x/sys v0.30.0
5555
google.golang.org/grpc v1.69.4
5656
google.golang.org/protobuf v1.36.3
57+
pgregory.net/rapid v1.2.0
5758
)
5859

5960
require (

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -576,4 +576,6 @@ honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWh
576576
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
577577
mvdan.cc/gofumpt v0.4.0 h1:JVf4NN1mIpHogBj7ABpgOyZc65/UUOkKQFkoURsz4MM=
578578
mvdan.cc/gofumpt v0.4.0/go.mod h1:PljLOHDeZqgS8opHRKLzp2It2VBuSdteAgqUfzMTxlQ=
579+
pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk=
580+
pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04=
579581
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=

index/shard_builder.go

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,15 @@ import (
2323
"net/url"
2424
"os"
2525
"path/filepath"
26+
"slices"
2627
"sort"
2728
"strings"
2829
"text/template"
2930
"time"
3031
"unicode/utf8"
3132

32-
"slices"
33-
3433
"github.com/sourcegraph/zoekt"
35-
"github.com/sourcegraph/zoekt/internal/languages"
34+
"github.com/sourcegraph/zoekt/languages"
3635
)
3736

3837
var _ = log.Println
@@ -404,13 +403,18 @@ func DetermineLanguageIfUnknown(doc *Document) {
404403
return
405404
}
406405

407-
if doc.SkipReason != SkipReasonNone {
408-
// If this document has been skipped, it's likely very large, or it's a non-code file like binary.
409-
// In this case, we just guess the language based on file name to avoid examining the contents.
410-
// Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
411-
doc.Language = languages.GetLanguage(doc.Name, nil)
412-
} else {
413-
doc.Language = languages.GetLanguage(doc.Name, doc.Content)
406+
// If this document has been skipped (doc.SkipReason != SkipReasonNone), it's
407+
// likely very large, or it's a non-code file like binary. In this case, we just
408+
// guess the language based on the file name to avoid examining the contents.
409+
// Note: passing nil content is allowed by the go-enry contract (the underlying
410+
// library we use here).
411+
var content []byte
412+
if doc.SkipReason == SkipReasonNone {
413+
content = doc.Content
414+
}
415+
langs := languages.GetLanguagesFromContent(doc.Name, content)
416+
if len(langs) > 0 {
417+
doc.Language = langs[0]
414418
}
415419
}
416420

internal/languages/language.go

Lines changed: 0 additions & 74 deletions
This file was deleted.

internal/languages/language_test.go

Lines changed: 0 additions & 107 deletions
This file was deleted.

languages/enry_vendored.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package languages
2+
3+
import "strings"
4+
5+
// This file contains private functions
6+
// vendored from the go-enry codebase.
7+
8+
// convertToAliasKey is vendored from go-enry to make sure
9+
// we're normalizing strings the same way.
10+
func convertToAliasKey(langName string) string {
11+
ak := strings.SplitN(langName, `,`, 2)[0]
12+
ak = strings.Replace(ak, ` `, `_`, -1)
13+
ak = strings.ToLower(ak)
14+
return ak
15+
}

0 commit comments

Comments
 (0)