diff --git a/pkg/handlers/handlers.go b/pkg/handlers/handlers.go
index 0a5b0e744b2c..1a27fc6b5476 100644
--- a/pkg/handlers/handlers.go
+++ b/pkg/handlers/handlers.go
@@ -3,6 +3,7 @@ package handlers
 import (
 	"archive/zip"
 	"bufio"
+	"bytes"
 	"context"
 	"errors"
 	"fmt"
@@ -11,6 +12,7 @@ import (
 
 	"github.com/gabriel-vasile/mimetype"
 	"github.com/mholt/archives"
+	"google.golang.org/protobuf/proto"
 
 	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/feature"
@@ -405,6 +407,8 @@ func handleChunksWithError(
 	chunkSkel *sources.Chunk,
 	reporter sources.ChunkReporter,
 ) error {
+	var linesConsumed int64
+
 	for {
 		select {
 		case dataOrErr, ok := <-dataErrChan:
@@ -422,7 +426,13 @@ func handleChunksWithError(
 			}
 			if len(dataOrErr.Data) > 0 {
 				chunk := *chunkSkel
+				if chunk.SourceMetadata != nil {
+					if cloned, ok := proto.Clone(chunk.SourceMetadata).(*source_metadatapb.MetaData); ok {
+						chunk.SourceMetadata = cloned
+					}
+				}
 				chunk.Data = dataOrErr.Data
+				linesConsumed = updateFilesystemLineMetadata(&chunk, linesConsumed)
 				if err := reporter.ChunkOk(ctx, chunk); err != nil {
 					return fmt.Errorf("error reporting chunk: %w", err)
 				}
@@ -433,6 +443,38 @@ func handleChunksWithError(
 	}
 }
 
+// updateFilesystemLineMetadata sets the 1-based starting line for filesystem chunks and
+// updates the running total of lines consumed so subsequent chunks can be
+// correctly anchored. Only the unique portion of the chunk (excluding the peek
+// overlap) contributes to the running count so that lines aren't double counted.
+//
+// This relies on HandleFile's default chunk reader, which emits chunks that
+// contain DefaultChunkSize bytes of unique data followed by a DefaultPeekSize
+// overlap with the next chunk.
+func updateFilesystemLineMetadata(chunk *sources.Chunk, linesConsumed int64) int64 {
+	if chunk.SourceMetadata == nil {
+		return linesConsumed
+	}
+	fsMeta := chunk.SourceMetadata.GetFilesystem()
+	if fsMeta == nil {
+		return linesConsumed
+	}
+
+	fsMeta.Line = linesConsumed + 1
+
+	data := chunk.Data
+	if len(data) == 0 {
+		return linesConsumed
+	}
+
+	uniqueLen := len(data)
+	if uniqueLen > sources.DefaultChunkSize {
+		uniqueLen = sources.DefaultChunkSize
+	}
+
+	return linesConsumed + int64(bytes.Count(data[:uniqueLen], []byte("\n")))
+}
+
 // isFatal determines whether the given error is a fatal error that should
 // terminate processing the current file, or a non-critical error that can be logged and ignored.
 // "Fatal" errors include context cancellation, deadline exceeded, and the
diff --git a/pkg/handlers/handlers_test.go b/pkg/handlers/handlers_test.go
index 87908de59a0b..32da972bb512 100644
--- a/pkg/handlers/handlers_test.go
+++ b/pkg/handlers/handlers_test.go
@@ -17,9 +17,12 @@ import (
 	"time"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 	diskbufferreader "github.com/trufflesecurity/disk-buffer-reader"
 
 	"github.com/trufflesecurity/trufflehog/v3/pkg/context"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/source_metadatapb"
+	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/sourcespb"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
 )
 
@@ -153,6 +156,53 @@ func BenchmarkHandleFile(b *testing.B) {
 	}
 }
 
+func TestHandleChunksWithErrorSetsFilesystemLine(t *testing.T) {
+	chunkCh := make(chan *sources.Chunk, 2)
+	reporter := sources.ChanReporter{Ch: chunkCh}
+
+	chunkSkel := &sources.Chunk{
+		SourceType: sourcespb.SourceType_SOURCE_TYPE_FILESYSTEM,
+		SourceMetadata: &source_metadatapb.MetaData{
+			Data: &source_metadatapb.MetaData_Filesystem{
+				Filesystem: &source_metadatapb.Filesystem{File: "test.txt"},
+			},
+		},
+	}
+
+	chunkSize := sources.DefaultChunkSize
+	peekSize := sources.DefaultPeekSize
+
+	chunkOneMain := bytes.Repeat([]byte("a\n"), chunkSize/2)
+	chunkOnePeek := bytes.Repeat([]byte("p\n"), peekSize/2)
+	chunkOne := append(chunkOneMain, chunkOnePeek...)
+
+	chunkTwo := bytes.Repeat([]byte("b\n"), 10)
+
+	dataErrChan := make(chan DataOrErr, 2)
+	dataErrChan <- DataOrErr{Data: chunkOne}
+	dataErrChan <- DataOrErr{Data: chunkTwo}
+	close(dataErrChan)
+
+	require.NoError(t, handleChunksWithError(context.Background(), dataErrChan, chunkSkel, reporter))
+
+	close(chunkCh)
+	var chunks []*sources.Chunk
+	for ch := range chunkCh {
+		chunks = append(chunks, ch)
+	}
+
+	require.Len(t, chunks, 2)
+
+	firstMeta := chunks[0].SourceMetadata.GetFilesystem()
+	require.NotNil(t, firstMeta)
+	require.Equal(t, int64(1), firstMeta.GetLine())
+
+	linesInFirstChunk := int64(bytes.Count(chunkOne[:chunkSize], []byte("\n")))
+	secondMeta := chunks[1].SourceMetadata.GetFilesystem()
+	require.NotNil(t, secondMeta)
+	require.Equal(t, linesInFirstChunk+1, secondMeta.GetLine())
+}
+
 func TestSkipArchive(t *testing.T) {
 	file, err := os.Open("testdata/test.tgz")
 	assert.Nil(t, err)