diff --git a/embedding/commentfilter/config.go b/embedding/commentfilter/config.go index 0160051..f900b55 100644 --- a/embedding/commentfilter/config.go +++ b/embedding/commentfilter/config.go @@ -29,8 +29,8 @@ const ( var filtersByExtension = map[string]filterEntry{ // Java/Kotlin ".java": filterConfig(MarkerCommentFilter{Syntax: javaSyntax}, allModes), - ".kt": filterConfig(MarkerCommentFilter{Syntax: javaSyntax}, allModes), - ".kts": filterConfig(MarkerCommentFilter{Syntax: javaSyntax}, allModes), + ".kt": filterConfig(KotlinCommentFilter{}, allModes), + ".kts": filterConfig(KotlinCommentFilter{}, allModes), ".groovy": filterConfig(MarkerCommentFilter{Syntax: javaSyntax}, allModes), // C# diff --git a/embedding/commentfilter/filter_test.go b/embedding/commentfilter/filter_test.go index c7fd149..eefbe8c 100644 --- a/embedding/commentfilter/filter_test.go +++ b/embedding/commentfilter/filter_test.go @@ -119,6 +119,144 @@ var _ = Describe("Comment filter", func() { }) }) + Describe("Kotlin", func() { + It("should keep all comments", func() { + lines := []string{ + "/** API docs. 
*/", + "/* implementation note */", + "val value = 1 // inline note", + } + + assertFiltered("Sample.kt", RetainAll, lines, lines) + }) + + It("should strip comments without treating raw string text as comments", func() { + lines := []string{ + "/* outer /* nested */ still comment */", + "val text = \"\"\"", + " This is not a /* comment */.", + " This is not a // comment either.", + " This removes a real comment: ${render(/* real raw argument */ value)}", + "\"\"\"", + "val message = \"value = ${render(/* real argument */ value)}\"", + } + + expected := []string{ + "val text = \"\"\"", + " This is not a /* comment */.", + " This is not a // comment either.", + " This removes a real comment: ${render( value)}", + "\"\"\"", + "val message = \"value = ${render( value)}\"", + } + + assertFiltered("Sample.kt", RetainNone, lines, expected) + }) + + It("should continue raw string interpolation after line comments", func() { + lines := []string{ + "val text = \"\"\"", + " ${render(", + " value, // real line comment", + " /* real block comment */ nextValue", + " )}", + " Keep // raw text.", + "\"\"\"", + } + + expected := []string{ + "val text = \"\"\"", + " ${render(", + " value, ", + " nextValue", + " )}", + " Keep // raw text.", + "\"\"\"", + } + + assertFiltered("Sample.kt", RetainNone, lines, expected) + }) + + It("should keep KDoc comments", func() { + lines := []string{ + "/** API docs. */", + "/* implementation note */", + "val value = 1 // inline note", + } + + expected := []string{ + "/** API docs. */", + "val value = 1 ", + } + + assertFiltered("Sample.kt", RetainDocumentation, lines, expected) + }) + + It("should keep regular comments", func() { + lines := []string{ + "/** API docs. 
*/", + "/* implementation note */", + "val value = 1 // inline note", + } + + expected := []string{ + "/* implementation note */", + "val value = 1 // inline note", + } + + assertFiltered("Sample.kt", RetainRegular, lines, expected) + }) + + It("should keep inline comments", func() { + lines := []string{ + "/** API docs. */", + "/* implementation note */", + "val value = 1 // inline note", + } + + expected := []string{ + "val value = 1 // inline note", + } + + assertFiltered("Sample.kt", RetainInline, lines, expected) + }) + + It("should keep nested block comments", func() { + lines := []string{ + "val before = 1", + "/* outer", + " /* nested */", + " still outer */", + "val after = 2 // inline", + } + + expected := []string{ + "val before = 1", + "/* outer", + " /* nested */", + " still outer */", + "val after = 2 ", + } + + assertFiltered("Sample.kts", RetainBlock, lines, expected) + }) + + It("should close empty documentation block comments", func() { + lines := []string{ + "/**/", + "val a = 1", + "val b = 2 /**/ val c = 3", + } + + expected := []string{ + "val a = 1", + "val b = 2 val c = 3", + } + + assertFiltered("Sample.kt", RetainNone, lines, expected) + }) + }) + Describe("JavaScript and TypeScript", func() { It("should strip comments without treating template literals as comments", func() { lines := []string{ diff --git a/embedding/commentfilter/kotlin_filter.go b/embedding/commentfilter/kotlin_filter.go new file mode 100644 index 0000000..4990306 --- /dev/null +++ b/embedding/commentfilter/kotlin_filter.go @@ -0,0 +1,361 @@ +// Copyright 2026, TeamDev. All rights reserved. +// +// Redistribution and use in source and/or binary forms, with or without +// modification, must retain the above copyright notice and the following +// disclaimer. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package commentfilter
+
+import "strings"
+
+// kotlinRawStringDelimiter is the triple-quote token that opens and closes a Kotlin raw string.
+const kotlinRawStringDelimiter = "\"\"\""
+
+// KotlinCommentFilter filters Kotlin comments while preserving Kotlin string forms.
+type KotlinCommentFilter struct{}
+
+// kotlinState tracks Kotlin lexical state that can span source lines.
+//
+// One value is shared by every line of a single Filter call.
+type kotlinState struct {
+	// blockDepth is the current nested block comment depth.
+	// Zero means scanning is outside any block comment.
+	blockDepth int
+
+	// blockKeep reports whether the active block comment should be retained.
+	blockKeep bool
+
+	// rawString reports whether scanning is inside a raw triple-quoted string.
+	rawString bool
+
+	// rawInterpolationDepth is the active brace depth of a raw-string interpolation.
+	// Zero means no raw-string interpolation is open.
+	rawInterpolationDepth int
+}
+
+// kotlinLineFilter filters one Kotlin source line.
+type kotlinLineFilter struct {
+	// line is the source line being filtered.
+	line string
+
+	// mode selects which comments to retain.
+	mode Mode
+
+	// state tracks Kotlin constructs across lines.
+	state *kotlinState
+
+	// result accumulates the filtered source line.
+	result strings.Builder
+
+	// position is the current byte index in line.
+	position int
+
+	// hadComment reports whether the line contained a recognized comment.
+	hadComment bool
+}
+
+// Filter removes or preserves Kotlin comments according to mode.
+//
+// Lines are scanned in order with one shared lexical state, so block comments,
+// raw strings and raw-string interpolations may span several lines. A line that
+// contained comment text and is blank after filtering is dropped; lines without
+// any comment are always kept, including blank ones.
+//
+// Parameters:
+//   lines - provides Kotlin source lines.
+//   mode - selects comments to retain.
+//
+// Returns filtered source lines.
+func (KotlinCommentFilter) Filter(lines []string, mode Mode) []string {
+	var filtered []string
+	state := kotlinState{}
+	for _, line := range lines {
+		filteredLine, hadComment := filterKotlinLine(line, mode, &state)
+		// Only lines emptied by comment removal are dropped, so the blank
+		// lines of the original code layout survive.
+		if hadComment && strings.TrimSpace(filteredLine) == "" {
+			continue
+		}
+		filtered = append(filtered, filteredLine)
+	}
+
+	return filtered
+}
+
+// filterKotlinLine removes or preserves recognized Kotlin comments from one line.
+//
+// It returns the filtered line and whether the line contained comment text.
+// The state is updated in place so the next line resumes where this one ended.
+func filterKotlinLine(line string, mode Mode, state *kotlinState) (string, bool) {
+	filter := kotlinLineFilter{
+		line:  line,
+		mode:  mode,
+		state: state,
+	}
+
+	return filter.filterLine()
+}
+
+// filterLine walks the current line until it reaches its end or a line comment.
+//
+// It returns the filtered line and whether the line contained comment text.
+func (f *kotlinLineFilter) filterLine() (string, bool) {
+	// A line that begins inside a block comment being stripped is comment text
+	// even when it is empty. The scan loop never runs for an empty line, so
+	// without this mark the caller would emit a stray blank line from the
+	// interior of a removed comment.
+	if f.state.blockDepth > 0 && !f.state.blockKeep {
+		f.hadComment = true
+	}
+	for f.position < len(f.line) {
+		// Constructs carried over from earlier lines are resumed first: an
+		// open block comment, then an open raw-string interpolation, then an
+		// open raw string.
+		if f.consumeActiveBlock() {
+			continue
+		}
+		if f.consumeRawInterpolation() {
+			continue
+		}
+		if f.consumeRawString() {
+			continue
+		}
+		if f.consumeString() {
+			continue
+		}
+		if consumed, stop := f.consumeComment(); consumed {
+			if stop {
+				break
+			}
+
+			continue
+		}
+		f.consumeCodeByte()
+	}
+
+	return f.result.String(), f.hadComment
+}
+
+// consumeActiveBlock consumes a possibly nested Kotlin block comment.
+func (f *kotlinLineFilter) consumeActiveBlock() bool {
+	if f.state.blockDepth == 0 {
+		return false
+	}
+	// Everything scanned from here is comment text, so the line counts as
+	// commented even when the block closes further along the same line.
+	f.hadComment = true
+	for f.position < len(f.line) {
+		switch {
+		case strings.HasPrefix(f.line[f.position:], cStyleBlockCommentStart):
+			// A nested opener deepens the block instead of starting a new one.
+			f.writeBlockText(cStyleBlockCommentStart)
+			f.state.blockDepth++
+			f.position += len(cStyleBlockCommentStart)
+		case strings.HasPrefix(f.line[f.position:], cStyleBlockCommentEnd):
+			f.writeBlockText(cStyleBlockCommentEnd)
+			f.state.blockDepth--
+			f.position += len(cStyleBlockCommentEnd)
+			if f.state.blockDepth == 0 {
+				// The outermost block closed; the caller resumes code scanning.
+				return true
+			}
+		default:
+			if f.state.blockKeep {
+				f.result.WriteByte(f.line[f.position])
+			}
+			f.position++
+		}
+	}
+
+	// The line ended inside the block; blockDepth carries it to the next line.
+	return true
+}
+
+// consumeRawString copies Kotlin raw-string text and filters `${...}` interpolation code.
+//
+// An opening delimiter is the first three quotes of a quote run. A closing
+// delimiter is a whole run of three or more quotes: Kotlin treats the quotes
+// before the final three as string content, so the run is copied and consumed
+// in one step. Closing on the first three quotes only would leave a stray quote
+// that is then read as the start of a regular string and hides any comment that
+// follows on the line.
+//
+// It returns false when scanning is neither inside a raw string nor positioned
+// at an opening delimiter. Otherwise it returns true after consuming text up to
+// the closing delimiter, an interpolation left open at the end of the line, or
+// the end of the line.
+func (f *kotlinLineFilter) consumeRawString() bool {
+	if !f.state.rawString && !strings.HasPrefix(f.line[f.position:], kotlinRawStringDelimiter) {
+		return false
+	}
+	if !f.state.rawString {
+		f.state.rawString = true
+		f.result.WriteString(kotlinRawStringDelimiter)
+		f.position += len(kotlinRawStringDelimiter)
+	}
+	for f.position < len(f.line) {
+		switch {
+		case strings.HasPrefix(f.line[f.position:], kotlinRawStringDelimiter):
+			// Take the whole quote run: any quotes beyond three belong to the
+			// string content that precedes the closing delimiter.
+			closeEnd := f.position + len(kotlinRawStringDelimiter)
+			for closeEnd < len(f.line) && f.line[closeEnd] == '"' {
+				closeEnd++
+			}
+			f.result.WriteString(f.line[f.position:closeEnd])
+			f.position = closeEnd
+			f.state.rawString = false
+
+			return true
+		case strings.HasPrefix(f.line[f.position:], "${"):
+			f.result.WriteString("${")
+			f.position += len("${")
+			// Interpolation code is scanned as Kotlin code, not as raw text.
+			// NOTE(review): this assignment overwrites any depth already being
+			// tracked when a raw string is nested inside another raw string's
+			// interpolation — confirm whether that case needs support.
+			f.state.rawString = false
+			f.state.rawInterpolationDepth = 1
+			f.consumeRawInterpolation()
+			if f.state.rawInterpolationDepth > 0 {
+				// The line ended before the interpolation closed.
+				return true
+			}
+		default:
+			f.consumeCodeByte()
+		}
+	}
+
+	return true
+}
+
+// consumeRawInterpolation resumes Kotlin expression scanning inside a raw-string interpolation.
+func (f *kotlinLineFilter) consumeRawInterpolation() bool {
+	state := f.state
+	if state.rawInterpolationDepth == 0 {
+		return false
+	}
+	f.consumeInterpolationDepth(&state.rawInterpolationDepth)
+	// Once the braces balance, scanning is back in the surrounding raw string.
+	if state.rawInterpolationDepth == 0 {
+		state.rawString = true
+	}
+
+	return true
+}
+
+// consumeString copies a Kotlin string or character literal that starts at the
+// current position, filtering comments inside interpolated expressions.
+//
+// It returns false when the current byte does not open a literal.
+func (f *kotlinLineFilter) consumeString() bool {
+	if f.position >= len(f.line) {
+		return false
+	}
+	if f.line[f.position] == '"' {
+		f.consumeQuotedString()
+
+		return true
+	}
+	if f.line[f.position] != '\'' {
+		return false
+	}
+	// Character literals are copied verbatim up to the end reported by the helper.
+	literalEnd := quotedSegmentEnd(f.line, f.position, "'")
+	f.result.WriteString(f.line[f.position:literalEnd])
+	f.position = literalEnd
+
+	return true
+}
+
+// consumeQuotedString copies a double-quoted Kotlin string and filters comments
+// inside its `${...}` expressions.
+//
+// Scanning stops at the closing quote or at the end of the line.
+func (f *kotlinLineFilter) consumeQuotedString() {
+	// Copy the opening quote.
+	f.consumeCodeByte()
+	for f.position < len(f.line) {
+		rest := f.line[f.position:]
+		switch {
+		case rest[0] == '\\':
+			// An escape pair is copied whole so an escaped quote cannot close the string.
+			f.writeEscapedByte()
+		case rest[0] == '"':
+			f.consumeCodeByte()
+
+			return
+		case strings.HasPrefix(rest, "${"):
+			f.result.WriteString("${")
+			f.position += len("${")
+			f.consumeInterpolation()
+		default:
+			f.consumeCodeByte()
+		}
+	}
+}
+
+// consumeInterpolation filters comments inside one `${...}` expression of a
+// quoted string, starting just after the opening brace.
+func (f *kotlinLineFilter) consumeInterpolation() {
+	braceDepth := 1
+	f.consumeInterpolationDepth(&braceDepth)
+}
+
+// consumeInterpolationDepth filters comments inside interpolation code
+// until depth closes or line ends.
+func (f *kotlinLineFilter) consumeInterpolationDepth(depth *int) {
+	for f.position < len(f.line) {
+		// Comments and literals inside the expression are handled before
+		// braces are counted, so braces in them do not change the depth.
+		if f.consumeActiveBlock() || f.consumeRawString() || f.consumeString() {
+			continue
+		}
+		consumed, lineEnded := f.consumeComment()
+		if consumed && lineEnded {
+			return
+		}
+		if consumed {
+			continue
+		}
+		nextDepth, closed := f.consumeInterpolationCode(*depth)
+		*depth = nextDepth
+		if closed {
+			*depth = 0
+
+			return
+		}
+	}
+}
+
+// consumeInterpolationCode copies one byte of expression code and returns the
+// updated brace depth and whether that byte closed the interpolation.
+func (f *kotlinLineFilter) consumeInterpolationCode(depth int) (int, bool) {
+	symbol := f.line[f.position]
+	f.consumeCodeByte()
+	switch symbol {
+	case '{':
+		return depth + 1, false
+	case '}':
+		depth--
+
+		return depth, depth == 0
+	}
+
+	return depth, false
+}
+
+// consumeComment consumes a Kotlin comment that starts at the current position.
+//
+// The first result reports whether a comment was recognized. The second reports
+// whether it was a line comment, which consumes the rest of the line.
+//
+// NOTE(review): no branch here retains comments for RetainAll; presumably the
+// caller bypasses this filter for that mode — confirm.
+func (f *kotlinLineFilter) consumeComment() (bool, bool) {
+	rest := f.line[f.position:]
+	switch {
+	case strings.HasPrefix(rest, cStyleDocCommentStart):
+		// The documentation opener is tested first, before the plain block opener.
+		f.startBlockComment(f.mode == RetainDocumentation)
+
+		return true, false
+	case strings.HasPrefix(rest, cStyleBlockCommentStart):
+		keepBlock := f.mode == RetainBlock || f.mode == RetainRegular
+		f.startBlockComment(keepBlock)
+
+		return true, false
+	case strings.HasPrefix(rest, "//"):
+		f.hadComment = true
+		if f.mode == RetainInline || f.mode == RetainRegular {
+			f.result.WriteString(rest)
+		}
+		f.position = len(f.line)
+
+		return true, true
+	default:
+		return false, false
+	}
+}
+
+// startBlockComment opens a Kotlin block comment at nesting depth one.
+//
+// Only the block opener is written and skipped here; the rest of the comment is
+// consumed later through the block depth recorded in the shared state.
+func (f *kotlinLineFilter) startBlockComment(keep bool) {
+	if keep {
+		f.result.WriteString(cStyleBlockCommentStart)
+	}
+	f.position += len(cStyleBlockCommentStart)
+	f.state.blockKeep = keep
+	f.state.blockDepth = 1
+	f.hadComment = true
+}
+
+// writeBlockText appends block comment text when the active mode retains it.
+func (f *kotlinLineFilter) writeBlockText(text string) {
+	if !f.state.blockKeep {
+		return
+	}
+	f.result.WriteString(text)
+}
+
+// writeEscapedByte copies the escape byte at the current position together with
+// the byte after it, or the escape byte alone when it is the last byte of the line.
+func (f *kotlinLineFilter) writeEscapedByte() {
+	pairEnd := f.position + 2
+	if pairEnd > len(f.line) {
+		pairEnd = len(f.line)
+	}
+	f.result.WriteString(f.line[f.position:pairEnd])
+	f.position = pairEnd
+}
+
+// consumeCodeByte copies the byte at the current position to the result and
+// advances past it.
+func (f *kotlinLineFilter) consumeCodeByte() {
+	current := f.line[f.position]
+	f.position++
+	f.result.WriteByte(current)
+}
diff --git a/embedding/commentfilter/visual_basic.go b/embedding/commentfilter/visual_basic_filter.go
similarity index 100%
rename from embedding/commentfilter/visual_basic.go
rename to embedding/commentfilter/visual_basic_filter.go