// fuzzy-engine.jsx — shared fuzzy-matching library
//
// Promotes the string-scoring core out of matcher-engine.jsx so it can be
// reused by reconciliation, dedup, header search, entity pickers — anywhere
// catalog data is being compared by humans-as-typed.
//
// Algorithms (each independently scaled into 0..1, take the MAX so any one
// can rescue a hard case):
//
//   1. Token Jaccard           — word-set overlap; survives reordering
//   2. Trigram cosine          — character-level; survives mid-word drift
//   3. Double Metaphone        — phonetic; survives misspellings/accents
//   4. Damerau-Levenshtein     — edit-distance with adjacent-swap; short titles
//   5. Smith-Waterman local    — partial alignment; long subtitles
//   6. Soft-TFIDF              — token rarity weighted; "the" doesn't carry
//
// Plus upstream:
//   - normalize()              — mojibake-aware + ascii-fold + token strip
//   - blockingKeys()          — metaphone[0:4] + first-trigram keys for N² control
//   - idPartialMatch()         — ISWC/IPI/ISRC prefix tolerance
//   - score()                  — composite score with all algos applied
//   - rank(query, list, getStr) — block + score + sort, with cutoff
//
// EXPORT: window.FuzzyEngine = { ... }
//
// matcher-engine.jsx keeps its own re-exports of the primitives it already
// uses; this file is the canonical home and adds the heavier algorithms.

(() => {
  // ─── normalization (mojibake + accent fold) ────────────────────────
  // Canonicalize a string for fuzzy comparison: repair mojibake when the
  // MojibakeEngine is available (decode accepted only at confidence >= 0.6),
  // fold accents via NFD + combining-mark strip, lowercase, and collapse
  // every non-alphanumeric run into a single space.
  function normalize(s) {
    if (!s) return '';
    let text = String(s);
    const engine = window.MojibakeEngine;
    if (engine) {
      try {
        const detection = engine.detect(text);
        if (detection && detection.corrupted) {
          const decoded = engine.decode(text);
          if (decoded && decoded.changed && decoded.confidence >= 0.6) {
            text = decoded.output;
          }
        }
      } catch (_) { /* defensive — engine may not be ready */ }
    }
    const folded = text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
    return folded.toLowerCase().replace(/[^a-z0-9]+/g, ' ').trim();
  }

  // Normalized word tokens of `s`, with empty strings filtered out.
  function tokens(s) {
    const words = normalize(s).split(' ');
    return words.filter((w) => w.length > 0);
  }

  // ─── Token Jaccard ─────────────────────────────────────────────────
  // Word-set Jaccard similarity: |A ∩ B| / |A ∪ B|. Insensitive to token
  // order and repetition; 0 when either side has no tokens.
  function jaccard(a, b) {
    const setA = new Set(tokens(a));
    const setB = new Set(tokens(b));
    if (setA.size === 0 || setB.size === 0) return 0;
    let shared = 0;
    for (const t of setA) {
      if (setB.has(t)) shared++;
    }
    // |A ∪ B| by inclusion-exclusion — avoids materializing the union set.
    return shared / (setA.size + setB.size - shared);
  }

  // ─── Trigram cosine ────────────────────────────────────────────────
  // Character-trigram counts of the normalized string. Two spaces of padding
  // on each side so word boundaries produce their own distinguishing grams.
  function trigrams(s) {
    const padded = `  ${normalize(s)}  `;
    const counts = new Map();
    for (let i = 0; i + 3 <= padded.length; i++) {
      const gram = padded.slice(i, i + 3);
      counts.set(gram, (counts.get(gram) ?? 0) + 1);
    }
    return counts;
  }
  // Cosine similarity over trigram count vectors; survives mid-word drift
  // that token-level measures miss. Returns 0 when either vector is empty.
  function trigramCos(a, b) {
    const va = trigrams(a);
    const vb = trigrams(b);
    if (va.size === 0 || vb.size === 0) return 0;
    let dot = 0;
    let magA = 0;
    let magB = 0;
    va.forEach((count, gram) => {
      magA += count * count;
      const other = vb.get(gram);
      if (other !== undefined) dot += count * other;
    });
    vb.forEach((count) => { magB += count * count; });
    return dot / Math.sqrt(magA * magB || 1);
  }

  // ─── Double Metaphone (compact) ────────────────────────────────────
  // Compact Metaphone-style phonetic key: normalized input is folded to a
  // consonant skeleton (uppercase, max 6 chars) so misspellings and accent
  // variants hash together. Single key only (not true Double Metaphone with
  // an alternate), but cheap enough to drive blocking keys.
  //
  // FIX: classic Metaphone *retains* an initial vowel, but the final strip
  // removed every vowel — silently deleting the 'a' marker emitted by the
  // i === 0 branch (and any leading 'y' glide), making that code dead.
  // The strip now skips position 0 so leading vowels survive.
  function metaphone(s) {
    if (!s) return '';
    s = normalize(s).replace(/\s+/g, '');
    if (!s) return '';
    let out = '', i = 0;
    const ch = (k) => s[k] || '';
    const isVowel = (c) => 'aeiouy'.includes(c);
    if (/^(gn|kn|pn|wr|ps)/.test(s)) i = 1;       // silent leading clusters
    if (s.startsWith('x')) { out += 's'; i = 1; } // initial x sounds like s

    // Cap raw output at 8 chars; final key is trimmed to 6 after stripping.
    while (i < s.length && out.length < 8) {
      const c = ch(i), next = ch(i + 1), prev = ch(i - 1);
      switch (c) {
        case 'a': case 'e': case 'i': case 'o': case 'u':
          // vowels only contribute at the very start of the word
          if (i === 0) out += 'a'; i++; break;
        case 'b': out += 'p'; i += (next === 'b') ? 2 : 1; break;
        case 'c':
          if (next === 'h') { out += (s.substr(i + 2, 2) === 'ar') ? 'k' : 'x'; i += 2; }
          else if ('iey'.includes(next)) { out += 's'; i += 1; }  // soft c
          else { out += 'k'; i += (next === 'c' ? 2 : 1); } break;
        case 'd':
          if (next === 'g' && 'iey'.includes(ch(i + 2))) { out += 'j'; i += 3; } // dge/dgi/dgy
          else { out += 't'; i += (next === 'd' ? 2 : 1); } break;
        case 'g':
          if (next === 'h') { i += 2; }                   // silent gh
          else if (next === 'n') { out += 'n'; i += 2; }  // gn → n
          else if ('iey'.includes(next)) { out += 'j'; i++; } // soft g
          else { out += 'k'; i++; } break;
        case 'h':
          if (i > 0 && !isVowel(prev)) i++;               // silent after consonant
          else if (isVowel(next)) { out += 'h'; i++; }
          else i++; break;
        case 'k': out += (prev === 'c') ? '' : 'k'; i++; break; // ck collapses
        case 'p':
          if (next === 'h') { out += 'f'; i += 2; }       // ph → f
          else { out += 'p'; i++; } break;
        case 'q': out += 'k'; i++; break;
        case 's':
          if (next === 'h') { out += 'x'; i += 2; }       // sh → x
          else if ('io'.includes(next) && ch(i + 2) === 'n') { out += 'x'; i += 3; } // s + i/o + n
          else { out += 's'; i += (next === 's' ? 2 : 1); } break;
        case 't':
          if (next === 'h') { out += '0'; i += 2; }       // th → '0' (theta)
          else if (next === 'i' && ch(i + 2) === 'o') { out += 'x'; i += 3; } // -tio-
          else { out += 't'; i++; } break;
        case 'v': out += 'f'; i++; break;
        case 'w': case 'y':
          if (isVowel(next)) { out += c; }                // glide kept only before vowel
          i++; break;
        case 'x': out += 'ks'; i++; break;
        case 'z': out += 's'; i++; break;
        default: out += c; i++; break;
      }
    }
    // Drop vowels everywhere EXCEPT position 0 (classic Metaphone keeps the
    // initial vowel), then cap the key length.
    return out.toUpperCase().replace(/(?!^)[AEIOUY]/g, '').slice(0, 6);
  }
  // Phonetic similarity: 1 for identical keys, otherwise the length of the
  // shared key prefix divided by the longer key's length; 0 if either key
  // is empty.
  function metaphoneSim(a, b) {
    const keyA = metaphone(a);
    const keyB = metaphone(b);
    if (!keyA || !keyB) return 0;
    if (keyA === keyB) return 1;
    const shorter = Math.min(keyA.length, keyB.length);
    let prefix = 0;
    while (prefix < shorter && keyA[prefix] === keyB[prefix]) prefix++;
    return prefix / Math.max(keyA.length, keyB.length);
  }

  // ─── Damerau-Levenshtein (with transposition) ──────────────────────
  // Returns NORMALIZED score in [0..1] (1 = identical).
  // Optimal-string-alignment distance (Levenshtein plus adjacent-swap),
  // normalized so 1 = identical and 0 = maximally different. Normalized
  // inputs longer than 64 chars short-circuit to 0 — the quadratic DP table
  // is not worth it there and other signals carry long strings.
  function damerau(a, b) {
    const x = normalize(a), y = normalize(b);
    if (!x.length && !y.length) return 1;
    if (!x.length || !y.length) return 0;
    if (x === y) return 1;
    const lenX = x.length, lenY = y.length;
    if (lenX > 64 || lenY > 64) return 0; // bail — table too large
    const dist = [];
    for (let i = 0; i <= lenX; i++) {
      const row = new Array(lenY + 1).fill(0);
      row[0] = i; // deleting i chars from x
      dist.push(row);
    }
    for (let j = 0; j <= lenY; j++) dist[0][j] = j; // inserting j chars
    for (let i = 1; i <= lenX; i++) {
      for (let j = 1; j <= lenY; j++) {
        const subCost = x[i - 1] === y[j - 1] ? 0 : 1;
        let best = Math.min(
          dist[i - 1][j] + 1,           // deletion
          dist[i][j - 1] + 1,           // insertion
          dist[i - 1][j - 1] + subCost, // substitution
        );
        const swapped = i > 1 && j > 1 &&
          x[i - 1] === y[j - 2] && x[i - 2] === y[j - 1];
        if (swapped) best = Math.min(best, dist[i - 2][j - 2] + 1); // transposition
        dist[i][j] = best;
      }
    }
    return 1 - dist[lenX][lenY] / Math.max(lenX, lenY);
  }

  // ─── Smith-Waterman local alignment ────────────────────────────────
  // For long titles with subtitles ("Despacito" vs "Despacito (Remix)").
  // Returns score normalized by length of shorter input.
  // Smith-Waterman local alignment, normalized so a perfect alignment of the
  // shorter string scores 1 — good for subtitle variants like "Despacito"
  // vs "Despacito (Remix)". Bails to 0 above 80 normalized chars.
  function smithWaterman(a, b) {
    const x = normalize(a), y = normalize(b);
    if (!x || !y) return 0;
    if (x === y) return 1;
    const rows = x.length, cols = y.length;
    if (rows > 80 || cols > 80) return 0; // quadratic DP — keep it bounded
    const MATCH = 2, MISMATCH = -1, GAP = -1;
    const grid = Array.from({ length: rows + 1 }, () => new Array(cols + 1).fill(0));
    let best = 0;
    for (let i = 1; i <= rows; i++) {
      for (let j = 1; j <= cols; j++) {
        const step = x[i - 1] === y[j - 1] ? MATCH : MISMATCH;
        const cell = Math.max(
          0, // local alignment: never go negative
          grid[i - 1][j - 1] + step,
          grid[i - 1][j] + GAP,
          grid[i][j - 1] + GAP,
        );
        grid[i][j] = cell;
        if (cell > best) best = cell;
      }
    }
    // Perfect local alignment of the shorter string scores MATCH * minLen.
    return Math.min(1, best / (Math.min(rows, cols) * MATCH));
  }

  // ─── Soft-TFIDF ────────────────────────────────────────────────────
  // Caller can supply a corpus-DF map (token → docCount). Without one,
  // we fall back to plain Jaccard. With one, rare tokens dominate.
  // Module-level corpus state for softTfidf(): DF maps token → number of
  // documents containing it; DF_TOTAL_DOCS is the corpus size it was built
  // from. Populated via setCorpusDF() or buildCorpusDFFromCatalog(); while
  // DF is null, softTfidf() falls back to plain jaccard().
  let DF = null;        // Map<token, docCount>
  let DF_TOTAL_DOCS = 0; // total documents behind DF (0 until a corpus is set)
  // Install a caller-supplied document-frequency map.
  function setCorpusDF(map, totalDocs) { DF = map; DF_TOTAL_DOCS = totalDocs; }
  // Build a document-frequency map from the catalog arrays hanging off
  // window (WORKS / RECORDINGS / RELEASES), capped per array so a huge
  // catalog stays cheap. Installs the map via setCorpusDF() and returns a
  // small summary. Intended to run ~once per session.
  function buildCorpusDFFromCatalog() {
    const df = new Map();
    let docCount = 0;
    const addDoc = (title) => {
      if (!title) return;
      docCount++;
      // A Set dedupes tokens so each counts at most once per document.
      for (const tok of new Set(tokens(title))) {
        df.set(tok, (df.get(tok) || 0) + 1);
      }
    };
    (window.WORKS || []).slice(0, 5000).forEach((w) => addDoc(w.title || w.name));
    (window.RECORDINGS || []).slice(0, 5000).forEach((r) => addDoc(r.title || r.name));
    (window.RELEASES || []).slice(0, 3000).forEach((r) => addDoc(r.title || r.name));
    setCorpusDF(df, docCount);
    return { tokens: df.size, docs: docCount };
  }
  // Inverse document frequency of one token; 1 when no corpus is loaded.
  // Tokens absent from DF are treated as df = 1 (maximally rare).
  function idf(token) {
    if (!DF || !DF_TOTAL_DOCS) return 1;
    const docFreq = DF.get(token) || 1;
    return Math.log(1 + DF_TOTAL_DOCS / docFreq);
  }
  // Soft-TFIDF token similarity: each token of `a` is greedily matched to
  // its most similar token of `b` (exact match, or a 0.7/0.3 blend of
  // edit-distance and phonetic agreement). Pairs at or above the θ
  // threshold contribute their idf-weighted similarity, so rare tokens
  // dominate and stopwords barely move the score. Falls back to plain
  // jaccard() when no corpus DF map is installed.
  //
  // FIX: removed the dead `matchedA` set — it was populated but never read.
  function softTfidf(a, b) {
    const A = tokens(a), B = tokens(b);
    if (!A.length || !B.length) return 0;
    if (!DF) return jaccard(a, b); // graceful degradation without a corpus
    // Token-pairs with similarity >= θ count as "matched"; weight by idf
    const θ = 0.85;
    let num = 0, normA = 0, normB = 0;
    for (const ta of A) {
      let best = 0, bestB = null;
      for (const tb of B) {
        const sim = ta === tb ? 1 : (damerau(ta, tb) * 0.7 + (metaphone(ta) === metaphone(tb) ? 0.3 : 0));
        if (sim > best) { best = sim; bestB = tb; }
      }
      if (best >= θ && bestB) {
        num += idf(ta) * idf(bestB) * best;
      }
      normA += idf(ta) * idf(ta);
    }
    for (const tb of B) normB += idf(tb) * idf(tb);
    return num / Math.sqrt(normA * normB || 1);
  }

  // ─── ID partial / prefix matching ──────────────────────────────────
  // Canonical identifier form: uppercase with everything but A-Z/0-9
  // removed. null/undefined (and other falsy values) become ''.
  function normId(id) {
    return String(id || '')
      .toUpperCase()
      .replace(/[^A-Z0-9]/g, '');
  }
  // Tolerant comparison for ISWC/IPI/ISRC-style codes: exact match scores 1;
  // prefix/suffix containment of a sufficiently long (≥ 8 char) fragment
  // earns a graded partial score; anything else 0.
  function idPartialMatch(a, b) {
    const x = normId(a);
    const y = normId(b);
    if (!x || !y) return 0;
    if (x === y) return 1;
    const longEnough = (id) => id.length >= 8;
    if (longEnough(x) && y.startsWith(x)) return 0.9;
    if (longEnough(y) && x.startsWith(y)) return 0.85;
    if (longEnough(x) && y.endsWith(x)) return 0.8;
    return 0;
  }

  // ─── Composite score ──────────────────────────────────────────────
  // Take the MAX across algorithms (each scaled). For small strings the
  // edit-distance signal is reliable; for long ones the local alignment
  // and trigram cosine pull weight; for misspellings, metaphone.
  // Composite similarity: each algorithm is scaled into 0..1 and the max is
  // taken, so any one signal can rescue a hard case. Non-Jaccard signals
  // carry small discounts so exact word overlap wins ties; softTfidf only
  // participates once a corpus DF map is loaded.
  function score(a, b) {
    if (!a || !b) return 0;
    const signals = [
      jaccard(a, b),
      trigramCos(a, b) * 0.95,
      metaphoneSim(a, b) * 0.85,
      damerau(a, b) * 0.95,
      smithWaterman(a, b) * 0.9,
      (DF ? softTfidf(a, b) : 0) * 0.95,
    ];
    return Math.max(...signals);
  }
  // Per-algorithm diagnostics for a pair, plus the composite; softTfidf is
  // null until a corpus DF map has been installed.
  function scoreBreakdown(a, b) {
    const breakdown = {
      jaccard: jaccard(a, b),
      trigram: trigramCos(a, b),
      metaphone: metaphoneSim(a, b),
      damerau: damerau(a, b),
      smithWaterman: smithWaterman(a, b),
      softTfidf: DF ? softTfidf(a, b) : null,
    };
    breakdown.composite = score(a, b);
    return breakdown;
  }

  // ─── Blocking key ─────────────────────────────────────────────────
  // Used to keep dedup tractable on a 50k-row catalog. Two records are
  // candidates iff they share at least one blocking key.
  // Candidate-generation keys for dedup blocking: a metaphone prefix ('m:'),
  // the first three normalized chars ('t:'), and the first "significant"
  // (≥ 4 char) token ('w:'). Two records are compared only if they share at
  // least one key, which keeps dedup tractable on a large catalog.
  function blockingKeys(s) {
    const keys = new Set();
    const phonetic = metaphone(s);
    if (phonetic && phonetic.length >= 3) keys.add('m:' + phonetic.slice(0, 4));
    const norm = normalize(s);
    if (norm.length >= 3) {
      keys.add('t:' + norm.slice(0, 3));
      // back-up key: first token long enough to be distinctive
      const firstBig = norm.split(' ').find((tok) => tok.length >= 4);
      if (firstBig) keys.add('w:' + firstBig.slice(0, 6));
    }
    return keys;
  }

  // ─── Rank query against list ──────────────────────────────────────
  // getStr(item) → string to compare. Returns sorted hits with score ≥ floor.
  // Score `query` against every item in `list` (string via getStr) and
  // return the best hits, highest score first, as { item, score, label }.
  // opts.floor — minimum score to keep (default 0.3);
  // opts.limit — maximum number of hits returned (default 20).
  function rank(query, list, getStr, opts = {}) {
    const floor = opts.floor ?? 0.3;
    const limit = opts.limit ?? 20;
    if (!query) return [];
    const hits = [];
    for (const item of list) {
      const label = getStr(item);
      if (!label) continue;
      const itemScore = score(query, label);
      if (itemScore >= floor) hits.push({ item, score: itemScore, label });
    }
    hits.sort((x, y) => y.score - x.score);
    return hits.slice(0, limit);
  }

  // ─── Find duplicate pairs in a list ───────────────────────────────
  // Uses blocking + scoring to keep N² in check. Returns [{a, b, score, breakdown}].
  // Find likely-duplicate pairs in `items`, using blocking to avoid full N².
  // Pairs scoring >= opts.threshold (default 0.85) are collected up to an
  // internal cap of 2 × opts.maxPairs, sorted by score descending, and the
  // top opts.maxPairs (default 500) returned as { a, b, aIdx, bIdx, score }.
  function findDuplicates(items, getStr, opts = {}) {
    const threshold = opts.threshold ?? 0.85;
    const maxPairs = opts.maxPairs ?? 500;
    const cap = maxPairs * 2; // over-collect, keep the best after sorting

    // 1. Bucket item indices by blocking key.
    const buckets = new Map(); // key → [item idx]
    items.forEach((item, idx) => {
      const label = getStr(item);
      if (!label) return;
      for (const key of blockingKeys(label)) {
        const bucket = buckets.get(key);
        if (bucket) bucket.push(idx);
        else buckets.set(key, [idx]);
      }
    });

    // 2. Score pairs only within buckets; a Set of "i:j" keys prevents
    //    re-scoring pairs that co-occur in several buckets.
    const scoredPairs = new Set();
    const out = [];
    outer:
    for (const bucket of buckets.values()) {
      if (bucket.length < 2) continue;
      for (let i = 0; i < bucket.length; i++) {
        for (let j = i + 1; j < bucket.length; j++) {
          const ai = bucket[i], bi = bucket[j];
          const pairKey = ai < bi ? ai + ':' + bi : bi + ':' + ai;
          if (scoredPairs.has(pairKey)) continue;
          scoredPairs.add(pairKey);
          const a = items[ai], b = items[bi];
          const sc = score(getStr(a), getStr(b));
          if (sc >= threshold) {
            out.push({ a, b, aIdx: ai, bIdx: bi, score: sc });
            if (out.length >= cap) break outer; // hit the collection cap
          }
        }
      }
    }
    out.sort((x, y) => y.score - x.score);
    return out.slice(0, maxPairs);
  }

  // Publish the engine on window. matcher-engine.jsx keeps re-exports of the
  // primitives it already used; this object is the canonical surface.
  window.FuzzyEngine = {
    // primitives
    normalize,
    tokens,
    jaccard,
    trigramCos,
    metaphone,
    metaphoneSim,
    damerau,
    smithWaterman,
    softTfidf,
    // composite
    score,
    scoreBreakdown,
    // ids
    normId,
    idPartialMatch,
    // blocking & ranking
    blockingKeys,
    rank,
    findDuplicates,
    // corpus
    setCorpusDF,
    buildCorpusDFFromCatalog,
  };
})();
