diff --git a/embedding/commentfilter/config.go b/embedding/commentfilter/config.go index f900b55..e4df86f 100644 --- a/embedding/commentfilter/config.go +++ b/embedding/commentfilter/config.go @@ -22,6 +22,7 @@ const ( cStyleBlockCommentStart = "/*" cStyleBlockCommentEnd = "*/" cStyleDocCommentStart = "/**" + javaTextBlockDelimiter = "\"\"\"" jsQuoteChars = "\"'`" ) @@ -98,6 +99,7 @@ var javaSyntax = CommentMarker{ Documentation: DocumentationMarker{ Block: []BlockMarker{{Start: cStyleDocCommentStart, End: cStyleBlockCommentEnd}}, }, + TextBlocks: []string{javaTextBlockDelimiter}, QuoteChars: "\"'", } diff --git a/embedding/commentfilter/filter_test.go b/embedding/commentfilter/filter_test.go index eefbe8c..8c23506 100644 --- a/embedding/commentfilter/filter_test.go +++ b/embedding/commentfilter/filter_test.go @@ -117,6 +117,49 @@ var _ = Describe("Comment filter", func() { assertFiltered("Api.java", RetainRegular, lines, expected) }) + + It("should strip comments without treating text block content as comments", func() { + lines := []string{ + "// header comment", + "String help = \"\"\"", + " Keep this // text.", + " Keep this /* text */ too.", + " \"\"\";", + "String value = \"not a // comment\"; // inline comment", + } + + expected := []string{ + "String help = \"\"\"", + " Keep this // text.", + " Keep this /* text */ too.", + " \"\"\";", + "String value = \"not a // comment\"; ", + } + + assertFiltered("Api.java", RetainNone, lines, expected) + }) + + It("should not close text blocks on escaped triple quotes", func() { + lines := []string{ + "String help = \"\"\"", + ` Quote: \"""`, + ` Escaped quote: \"`, + " Keep this // text.", + " \"\"\";", + "String value = \"kept\"; // real comment", + } + + expected := []string{ + "String help = \"\"\"", + ` Quote: \"""`, + ` Escaped quote: \"`, + " Keep this // text.", + " \"\"\";", + "String value = \"kept\"; ", + } + + assertFiltered("Api.java", RetainNone, lines, expected) + }) }) Describe("Kotlin", func() { diff --git a/embedding/commentfilter/marker_comment_filter.go b/embedding/commentfilter/marker_comment_filter.go index 8f49222..d764aa5 100644 --- a/embedding/commentfilter/marker_comment_filter.go +++ b/embedding/commentfilter/marker_comment_filter.go @@ -49,6 +49,9 @@ type CommentMarker struct { // Documentation contains API documentation comment markers. Documentation DocumentationMarker + // TextBlocks contains delimiters that open and close multi-line text literals. + TextBlocks []string + // QuoteChars contains characters that open and close quoted strings. QuoteChars string } @@ -59,7 +62,7 @@ type MarkerCommentFilter struct { Syntax CommentMarker } -// blockState tracks an active block comment across source lines. +// blockState tracks active multi-line lexical constructs across source lines. type blockState struct { // active reports whether scanning is inside a block comment. active bool @@ -69,6 +72,12 @@ type blockState struct { // keep reports whether the active comment should be retained. keep bool + + // textBlockActive reports whether scanning is inside a text block. + textBlockActive bool + + // textBlockDelimiter contains the marker that closes the active text block. + textBlockDelimiter string } // markerLineFilter tracks lexical comment filtering state for one source line. @@ -82,7 +91,7 @@ type markerLineFilter struct { // mode selects which comments to retain. mode Mode - // state tracks block comments across lines. + // state tracks multi-line lexical constructs across lines. state *blockState // result accumulates the filtered source line. @@ -138,6 +147,12 @@ func (f *markerLineFilter) filterLine() (string, bool) { if f.consumeActiveBlock() { continue } + if f.consumeActiveTextBlock() { + continue + } + if f.consumeTextBlockStart() { + continue + } if f.consumeQuotedSegment() { continue } @@ -179,6 +194,57 @@ func (f *markerLineFilter) consumeActiveBlock() bool { return true } +// consumeActiveTextBlock copies text block content until the closing delimiter. +func (f *markerLineFilter) consumeActiveTextBlock() bool { + if !f.state.textBlockActive { + return false + } + endPosition, found := textBlockEnd(f.line, f.position, f.state.textBlockDelimiter) + if !found { + f.result.WriteString(f.line[f.position:]) + f.position = len(f.line) + + return true + } + f.result.WriteString(f.line[f.position:endPosition]) + f.position = endPosition + f.state.textBlockActive = false + f.state.textBlockDelimiter = "" + + return true +} + +// textBlockEnd returns the end offset of a text block close delimiter. +func textBlockEnd(line string, position int, delimiter string) (int, bool) { + for cursor := position; cursor < len(line); { + if line[cursor] == '\\' { + cursor += 2 + + continue + } + if strings.HasPrefix(line[cursor:], delimiter) { + return cursor + len(delimiter), true + } + cursor++ + } + + return len(line), false +} + +// consumeTextBlockStart starts a configured text block literal. +func (f *markerLineFilter) consumeTextBlockStart() bool { + delimiter, found := prefixFrom(f.line, f.position, f.filter.Syntax.TextBlocks) + if !found { + return false + } + f.result.WriteString(delimiter) + f.position += len(delimiter) + f.state.textBlockActive = true + f.state.textBlockDelimiter = delimiter + + return true +} + // consumeQuotedSegment copies a quoted segment without scanning comment markers inside it. func (f *markerLineFilter) consumeQuotedSegment() bool { quoteEnd := quotedSegmentEnd(f.line, f.position, f.filter.Syntax.QuoteChars) @@ -264,13 +330,20 @@ func (f *markerLineFilter) consumeCodeByte() { // prefixAt reports whether one of the given prefixes starts at the position. func prefixAt(line string, position int, prefixes []string) bool { + _, found := prefixFrom(line, position, prefixes) + + return found +} + +// prefixFrom returns the prefix starting at position when one exists. +func prefixFrom(line string, position int, prefixes []string) (string, bool) { for _, prefix := range prefixes { if strings.HasPrefix(line[position:], prefix) { - return true + return prefix, true } } - return false + return "", false } // blockAt reports whether one of the given block markers starts at the position.