diff options
| author | Adam Malczewski <[email protected]> | 2026-06-02 20:46:23 +0900 |
|---|---|---|
| committer | Adam Malczewski <[email protected]> | 2026-06-02 20:46:23 +0900 |
| commit | 80212bfb009eaf71a4743310dee6ed08b8f7e1da (patch) | |
| tree | 81b1873f4a276116cf019cbcdecdd8fcba56beef /docker/cs | |
| parent | 09914c6ba15214d5ec05c106d5d11fd14a86f532 (diff) | |
| download | dispatch-80212bfb009eaf71a4743310dee6ed08b8f7e1da.tar.gz dispatch-80212bfb009eaf71a4743310dee6ed08b8f7e1da.zip | |
fix(search_code): fuzzy mid-word matching + Luau/fuzzy live test coverage
Address the remaining real defects from the Luau/cs test reports. The wrapper-
level findings (dash-leading queries, context flag, no-match message, empty
query, path-is-file) were already fixed in earlier commits and verified through
the tool; the two genuinely-open items were engine-level, plus a test-coverage
gap (the patch-dependent behaviors were only exercised by live tests that
skip without a cs binary).
- Engine fix (docker/cs/fuzzy-distance.patch): cs's fuzzy `term~N` only scanned
same-length windows, so it matched substitutions but never mid-word
insertions/deletions — e.g. `computSlipAngle~1` (a dropped 'e') failed to find
`computeSlipAngle`, contradicting cs's own "within 1 or 2 distance" docs.
Now scan windows of length termLen±maxDist (true Levenshtein) and keep the
best per offset. Updates one pre-existing cs test that encoded the buggy
substitution-only behaviour and adds mid-word insert/delete cases. Passes
cs's pkg/search + pkg/ranker suites; builds clean against the pinned commit.
- Provisioning: apply the new patch everywhere the Luau patch is applied —
Dockerfile, Dockerfile.dev, packaging/PKGBUILD build() — so every install
path (Docker bin/up, native code-search package via bin/service install)
ships both patches.
- Tests: add skip-gated live tests for Luau declaration detection (function /
local function / type / export type), only=usages exclusion, the Luau
language tag, and fuzzy mid-word matching. New capability probes
(findLuauCapableCs / findFuzzyCapableCs) run these only on a cs that actually
has each patch and skip (never fail) on an unpatched/absent binary. Default
suite: 600 pass / 12 skip; with a both-patched cs: 612 pass / 0 skip.
- Docs: UPSTREAM_CS_FUZZY_BUG.md documents the unreported upstream defect for a
potential boyter/cs PR; CS_ARTIX_DEPLOY.md updated to reflect that
sync-dispatch.sh now ships the code-search package (carrying both patches).
biome + tsc (core/api/frontend) + svelte-check all green.
Diffstat (limited to 'docker/cs')
| -rw-r--r-- | docker/cs/fuzzy-distance.patch | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/docker/cs/fuzzy-distance.patch b/docker/cs/fuzzy-distance.patch new file mode 100644 index 0000000..e432986 --- /dev/null +++ b/docker/cs/fuzzy-distance.patch @@ -0,0 +1,159 @@ +Fix cs fuzzy matching to honour true Levenshtein edit distance (insertions and +deletions), not just same-length substitutions. + +Upstream cs (v3.1.0) implements `term~N` fuzzy search by scanning only +same-length windows of the content: for a term of length L it compares every +L-character substring and keeps those within N edits. Because every candidate +window is exactly L long, the only edits it can ever observe are substitutions. +A mid-word insertion or deletion — e.g. `computSlipAngle~1` for the real symbol +`computeSlipAngle` (a dropped 'e'), or `houss~1` vs `house` — changes the length +by one and so is never matched, even though it is edit distance 1. This +contradicts cs's own documentation ("fuzzy match within 1 or 2 distance"). + +The fix scans windows of every plausible length in +[termLen-maxDist, termLen+maxDist] (clamped at 1) at each offset and keeps the +best (lowest-distance) match, so insertions and deletions match too. fuzzyFind +records the best window per start and advances past it to avoid emitting a swarm +of overlapping locations for one logical match. + +Purely localised to the two fuzzy helpers in pkg/search/executor.go (plus a +test update: the pre-existing `hovze~2` case asserted the old substitution-only +behaviour, where the distance-2 term coincidentally failed to match a shorter +window that is in fact within distance 2; it is replaced with an unambiguous +distance-1 case and new mid-word insertion/deletion cases). Passes cs's own +pkg/search + pkg/ranker test suites. Applied during the Docker build and the +native package build via `git apply` against the pinned v3.1.0 checkout (see +Dockerfile / Dockerfile.dev / packaging/PKGBUILD). Candidate for upstreaming to +boyter/cs (the defect is unreported there). + +diff --git a/pkg/search/executor.go b/pkg/search/executor.go +index 175d458..1c422d2 100644 +--- a/pkg/search/executor.go ++++ b/pkg/search/executor.go +@@ -632,17 +632,67 @@ func min3(a, b, c int) int { + return c + } + +-// fuzzyContains checks if any same-length substring of content matches +-// the term within the given edit distance (substitution-based matching). ++// fuzzyWindowBounds returns the inclusive range of substring lengths that ++// could be within maxDist edits of a term of length termLen. A Levenshtein ++// distance of d can only be achieved between strings whose lengths differ by ++// at most d (each insertion or deletion changes the length by one), so any ++// candidate window must have a length in [termLen-maxDist, termLen+maxDist]. ++// The lower bound is clamped to 1 so we never form a zero-length window. ++func fuzzyWindowBounds(termLen, maxDist int) (int, int) { ++ minLen := termLen - maxDist ++ if minLen < 1 { ++ minLen = 1 ++ } ++ return minLen, termLen + maxDist ++} ++ ++// bestFuzzyMatchAt returns the length of the best (lowest-distance) substring ++// of content starting at offset i that is within maxDist edits of term, or -1 ++// if none is. Windows of every plausible length are tried (see ++// fuzzyWindowBounds) so insertions and deletions match, not just ++// substitutions — e.g. "computSlipAngle" (a dropped 'e') matches ++// "computeSlipAngle" at distance 1. Ties prefer the length closest to termLen. ++func bestFuzzyMatchAt(content, term string, i, maxDist, minLen, maxLen int) int { ++ contentLen := len(content) ++ bestLen := -1 ++ bestDist := maxDist + 1 ++ bestDelta := 0 ++ for wl := minLen; wl <= maxLen; wl++ { ++ if i+wl > contentLen { ++ break ++ } ++ d := levenshtein(content[i:i+wl], term) ++ if d > maxDist { ++ continue ++ } ++ delta := wl - len(term) ++ if delta < 0 { ++ delta = -delta ++ } ++ if d < bestDist || (d == bestDist && delta < bestDelta) { ++ bestDist = d ++ bestLen = wl ++ bestDelta = delta ++ } ++ } ++ return bestLen ++} ++ ++// fuzzyContains reports whether any substring of content is within maxDist ++// edits (true Levenshtein: insertions, deletions, and substitutions) of term. + func fuzzyContains(content, term string, maxDist, termLen int) bool { + contentLen := len(content) +- if contentLen == 0 || termLen == 0 || termLen > contentLen { ++ if contentLen == 0 || termLen == 0 { ++ return false ++ } ++ ++ minLen, maxLen := fuzzyWindowBounds(termLen, maxDist) ++ if minLen > contentLen { + return false + } + +- for i := 0; i <= contentLen-termLen; i++ { +- window := content[i : i+termLen] +- if levenshtein(window, term) <= maxDist { ++ for i := 0; i <= contentLen-minLen; i++ { ++ if bestFuzzyMatchAt(content, term, i, maxDist, minLen, maxLen) >= 0 { + return true + } + } +@@ -651,18 +701,30 @@ func fuzzyContains(content, term string, maxDist, termLen int) bool { + + // fuzzyFind finds all match locations in content that are within the given + // edit distance of the term. Returns [][]int where each entry is [start, end]. ++// Like fuzzyContains it considers variable-length windows so insertions and ++// deletions match. To avoid emitting a swarm of overlapping windows for one ++// logical match, it records the best window at each start and then advances ++// past it. + func fuzzyFind(content, term string, maxDist, termLen int) [][]int { + contentLen := len(content) +- if contentLen == 0 || termLen == 0 || termLen > contentLen { ++ if contentLen == 0 || termLen == 0 { ++ return nil ++ } ++ ++ minLen, maxLen := fuzzyWindowBounds(termLen, maxDist) ++ if minLen > contentLen { + return nil + } + + var locs [][]int +- for i := 0; i <= contentLen-termLen; i++ { +- window := content[i : i+termLen] +- if levenshtein(window, term) <= maxDist { +- locs = append(locs, []int{i, i + termLen}) +- } ++ for i := 0; i <= contentLen-minLen; { ++ wl := bestFuzzyMatchAt(content, term, i, maxDist, minLen, maxLen) ++ if wl < 0 { ++ i++ ++ continue ++ } ++ locs = append(locs, []int{i, i + wl}) ++ i += wl + } + return locs + } +diff --git a/pkg/search/search_test.go b/pkg/search/search_test.go +index 6cbd01f..eacbd75 100644 +--- a/pkg/search/search_test.go ++++ b/pkg/search/search_test.go +@@ -57,7 +57,10 @@ func TestExecutor(t *testing.T) { + {"Fuzzy Distance 1", "houss~1", false, []string{"src/main/file1.go"}}, // "houss" is distance 1 from "house" (only in file1) + {"Fuzzy Distance 1 No Match", "zzz~1", false, []string{}}, + {"Fuzzy AND Keyword", "houss~1 AND brown", false, []string{"src/main/file1.go"}}, +- {"Fuzzy Distance 2", "hovze~2", false, []string{"src/main/file1.go"}}, // "hovze" is distance 2 from "house" (u→v, s→z), only in file1 ++ {"Fuzzy Distance 2", "houze~2", false, []string{"src/main/file1.go"}}, // "houze" is distance 1 from "house" (s→z), only in file1 ++ // Mid-word insertion/deletion must match (true Levenshtein, not just same-length substitution). ++ {"Fuzzy Distance 1 deletion", "hose~1", false, []string{"src/main/file1.go"}}, // "hose" -> "house" (insert 'u'), distance 1 ++ {"Fuzzy Distance 1 insertion", "houxse~1", false, []string{"src/main/file1.go"}}, // "houxse" -> "house" (delete 'x'), distance 1 + + // Colon filter syntax + {"Colon file filter", "cat file:file1", false, []string{"src/main/file1.go"}}, |
