// matcher-engine.jsx — intelligent income-line matching engine
//
// Replaces the inline autoMatch() in reconciliation-queue.jsx with a richer
// strategy stack. Same return shape, expanded internals.
//
// New strategies layered on top of ISRC / UPC / learned-rule / title-fuzzy:
//   • Mojibake-aware normalization (titles like "T√∫" still match "Tú")
//   • Double Metaphone phonetic backstop (English + Romance fallback)
//   • Trigram cosine for catalog where token-overlap fails
//   • ISWC / IPI partial / prefix-truncation matching
//   • Fractional / multi-rights-holder split candidates (PRO statements)
//   • Cluster-pass: looks at OTHER unmatched lines from the same statement
//     to propose batch-level matches (same artist+album block).
//   • Confidence calibration: scores normalized into observed buckets per
//     source so you can tune the auto-apply threshold.
//
// EXPORTS: window.MatcherEngine = {
//   matchOne(line, ctx),           // → {status, confidence, candidates, top}
//   matchBatch(lines, ctx),        // → array of results, with cluster boost
//   makeSignature(line),           // (source × title-prefix × territory)
//   metaphone(s), trigramCos(a,b), // primitives, exposed for tests
//   jaccard(a,b), metaphoneSim(a,b), titleScore(a,b), // similarity scores, exposed for tests
//   idPartialMatch(idLine, idCatalog), // exact / prefix / suffix id match → 0..1
//   normalize(s),                  // mojibake-aware + ascii-fold + token strip
//   calibrate(results)             // → histogram + suggested per-source threshold
// }

(() => {
  // ── primitives ────────────────────────────────────────────────────

  // Mojibake-aware normalize: if MojibakeEngine has classified this string
  // as corrupted, decode FIRST, then ascii-fold, then token-strip.
  /**
   * Canonicalize free text for comparison.
   *
   * Steps, in order: optional mojibake repair (only when window.MojibakeEngine
   * is present, flags the text as corrupted, and reports a changed decode with
   * confidence >= 0.6), accent folding via NFD + combining-mark removal,
   * lower-casing, and collapsing every run of non [a-z0-9] characters into a
   * single space.
   *
   * @param {*} s - Value to normalize; any falsy value yields ''.
   * @returns {string} Lower-case, space-separated, trimmed text.
   */
  function normalize(s) {
    if (!s) return '';

    let text = String(s);
    const engine = window.MojibakeEngine;
    if (engine) {
      const verdict = engine.detect(text);
      const looksCorrupted = Boolean(verdict && verdict.corrupted);
      if (looksCorrupted) {
        const repaired = engine.decode(text);
        const trustworthy = repaired && repaired.changed && repaired.confidence >= 0.6;
        if (trustworthy) {
          text = repaired.output;
        }
      }
    }

    // Decompose accented letters, then drop the combining marks.
    const folded = text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
    const lowered = folded.toLowerCase();
    return lowered.replace(/[^a-z0-9]+/g, ' ').trim();
  }

  /**
   * Break text into normalized word tokens.
   *
   * @param {*} s - Text to tokenize (passed through normalize()).
   * @returns {string[]} Non-empty tokens in their original order.
   */
  function tokens(s) {
    const words = normalize(s).split(' ');
    return words.filter((word) => word !== '');
  }

  // Token Jaccard
  /**
   * Jaccard similarity over the distinct tokens of two strings:
   * |A ∩ B| / |A ∪ B|.
   *
   * @param {*} a - First text.
   * @param {*} b - Second text.
   * @returns {number} Value in [0, 1]; 0 when either side has no tokens.
   */
  function jaccard(a, b) {
    const left = new Set(tokens(a));
    const right = new Set(tokens(b));
    if (left.size === 0 || right.size === 0) return 0;

    let shared = 0;
    for (const tok of left) {
      if (right.has(tok)) shared += 1;
    }
    // |A ∪ B| = |A| + |B| − |A ∩ B|
    const unionSize = left.size + right.size - shared;
    return shared / unionSize;
  }

  // ── Double Metaphone (compact implementation) ─────────────────────
  // Based on Lawrence Philips' algorithm; we ship a simplified single-key
  // version that's good enough for music-catalog name matching. Encodes
  // surnames and song titles into a phonetic key so "Beyonce" ≈ "Beyoncé"
  // ≈ "Beyonse" all collapse to "PNS" (vowels and Y are dropped from the key).
  /**
   * Encode text as a simplified, single-key Metaphone-style phonetic key.
   *
   * The input is normalize()d and whitespace is removed, so the whole string
   * is encoded as one word. The returned key is upper-case, has every
   * A/E/I/O/U/Y removed, and is at most 6 characters long.
   *
   * Look-ahead / look-behind past either end of the word yields ''. Membership
   * tests must reject '' explicitly because `'iey'.includes('')` is true in
   * JavaScript; without that guard a word-final "c" or "g" is treated as soft
   * and a word-final "h" / "w" is emitted.
   *
   * @param {*} s - Text to encode; falsy input yields ''.
   * @returns {string} Phonetic key, or '' when nothing encodable remains.
   */
  function metaphone(s) {
    if (!s) return '';
    const word = normalize(s).replace(/\s+/g, '');
    if (!word) return '';

    const ch = (k) => word[k] || '';
    // Safe membership test: an out-of-range neighbour ('') is never a member.
    const oneOf = (set, c) => c !== '' && set.includes(c);
    const isVowel = (c) => oneOf('aeiouy', c);
    const softens = (c) => oneOf('iey', c);

    let out = '';
    let i = 0;
    // skip silent leading pairs
    if (/^(gn|kn|pn|wr|ps)/.test(word)) i = 1;
    if (word.startsWith('x')) { out += 's'; i = 1; }

    // Encoding stops once 8 raw symbols have been produced; the key is cut to
    // 6 after the final vowel strip.
    while (i < word.length && out.length < 8) {
      const c = ch(i);
      const next = ch(i + 1);
      const prev = ch(i - 1);
      switch (c) {
        case 'a': case 'e': case 'i': case 'o': case 'u':
          // Leading-vowel marker; it counts toward the raw length cap but is
          // removed by the final vowel strip below.
          if (i === 0) out += 'a';
          i++;
          break;
        case 'b':
          out += 'p';
          i += (next === 'b') ? 2 : 1;
          break;
        case 'c':
          if (next === 'h') {
            out += (word.slice(i + 2, i + 4) === 'ar') ? 'k' : 'x';
            i += 2;
          } else if (softens(next)) {
            out += 's';
            i += 1;
          } else {
            out += 'k';
            i += (next === 'c') ? 2 : 1;
          }
          break;
        case 'd':
          if (next === 'g' && softens(ch(i + 2))) {
            out += 'j';
            i += 3;
          } else {
            out += 't';
            i += (next === 'd') ? 2 : 1;
          }
          break;
        case 'g':
          if (next === 'h') {
            // "gh" is treated as silent.
            i += 2;
          } else if (next === 'n') {
            out += 'n';
            i += 2;
          } else if (softens(next)) {
            out += 'j';
            i++;
          } else {
            out += 'k';
            i++;
          }
          break;
        case 'h':
          // Emitted only before a vowel, and only at word start or after a vowel.
          if ((i === 0 || isVowel(prev)) && isVowel(next)) out += 'h';
          i++;
          break;
        case 'k':
          // "ck": the preceding 'c' already produced the K.
          if (prev !== 'c') out += 'k';
          i++;
          break;
        case 'p':
          if (next === 'h') {
            out += 'f';
            i += 2;
          } else {
            out += 'p';
            i++;
          }
          break;
        case 'q':
          out += 'k';
          i++;
          break;
        case 's':
          if (next === 'h') {
            out += 'x';
            i += 2;
          } else if (oneOf('io', next) && ch(i + 2) === 'n') {
            out += 'x';
            i += 3;
          } else {
            out += 's';
            i += (next === 's') ? 2 : 1;
          }
          break;
        case 't':
          if (next === 'h') {
            out += '0';
            i += 2;
          } else if (next === 'i' && ch(i + 2) === 'o') {
            out += 'x';
            i += 3;
          } else {
            out += 't';
            i++;
          }
          break;
        case 'v':
          out += 'f';
          i++;
          break;
        case 'w': case 'y':
          if (isVowel(next)) out += c;
          i++;
          break;
        case 'x':
          out += 'ks';
          i++;
          break;
        case 'z':
          out += 's';
          i++;
          break;
        default:
          // Remaining letters and digits pass through unchanged.
          out += c;
          i++;
          break;
      }
    }
    return out.toUpperCase().replace(/[AEIOUY]/g, '').slice(0, 6);
  }

  /**
   * Phonetic similarity of two strings based on their metaphone() keys.
   *
   * @param {*} a - First text.
   * @param {*} b - Second text.
   * @returns {number} 1 for identical keys, 0 when either key is empty,
   *   otherwise the shared-prefix length divided by the longer key's length.
   */
  function metaphoneSim(a, b) {
    const keyA = metaphone(a);
    const keyB = metaphone(b);
    if (!keyA || !keyB) return 0;
    if (keyA === keyB) return 1;

    // Length of the common prefix of the two keys.
    const limit = Math.min(keyA.length, keyB.length);
    let shared = 0;
    for (; shared < limit; shared++) {
      if (keyA[shared] !== keyB[shared]) break;
    }
    const longest = Math.max(keyA.length, keyB.length);
    return shared / longest;
  }

  // ── Trigram cosine ────────────────────────────────────────────────
  /**
   * Build a character-trigram frequency map for the normalized text.
   *
   * The normalized text is padded with two spaces on each side so that
   * leading and trailing characters contribute boundary trigrams.
   *
   * Text that normalizes to '' (empty input, punctuation only, or scripts that
   * normalize() strips entirely) yields an EMPTY map. Padding an empty string
   * would otherwise produce the all-space trigram, making any two such strings
   * look identical to trigramCos().
   *
   * @param {*} s - Text to analyse.
   * @returns {Map<string, number>} Trigram → occurrence count.
   */
  function trigrams(s) {
    const grams = new Map();
    const text = normalize(s);
    if (!text) return grams;

    const padded = `  ${text}  `;
    for (let i = 0; i < padded.length - 2; i++) {
      const gram = padded.slice(i, i + 3);
      grams.set(gram, (grams.get(gram) || 0) + 1);
    }
    return grams;
  }
  /**
   * Cosine similarity between the trigram count vectors of two strings.
   *
   * @param {*} a - First text.
   * @param {*} b - Second text.
   * @returns {number} Cosine in [0, 1]; 0 when either trigram map is empty.
   */
  function trigramCos(a, b) {
    const left = trigrams(a);
    const right = trigrams(b);
    if (left.size === 0 || right.size === 0) return 0;

    let dot = 0;
    let leftNormSq = 0;
    left.forEach((count, gram) => {
      leftNormSq += count * count;
      if (right.has(gram)) dot += count * right.get(gram);
    });

    let rightNormSq = 0;
    right.forEach((count) => {
      rightNormSq += count * count;
    });

    // `|| 1` keeps the denominator non-zero.
    const denom = Math.sqrt(leftNormSq * rightNormSq || 1);
    return dot / denom;
  }

  // Combined title score: max of jaccard, trigram, metaphone (each
  // re-scaled into the same band so the max is meaningful).
  /**
   * Blend three title similarities into a single score.
   *
   * Token Jaccard is taken at face value; trigram cosine is weighted by 0.95
   * and phonetic similarity by 0.85. The best of the three wins.
   *
   * @param {*} a - First title.
   * @param {*} b - Second title.
   * @returns {number} The largest of the three weighted scores.
   */
  function titleScore(a, b) {
    const weighted = [
      jaccard(a, b),
      trigramCos(a, b) * 0.95,
      metaphoneSim(a, b) * 0.85,
    ];
    return Math.max(...weighted);
  }

  // ── ID partial / prefix matching ──────────────────────────────────
  // Truncated ISWC / IPI codes show up in PRO statements; build prefix
  // index and match if the line's truncated id is a prefix of a real one.
  /**
   * Canonical form of an identifier code: upper-case with every character
   * outside A-Z / 0-9 removed.
   *
   * @param {*} id - Raw identifier; any falsy value yields ''.
   * @returns {string} Canonical identifier.
   */
  function normId(id) {
    const raw = id ? String(id) : '';
    const upper = raw.toUpperCase();
    return upper.replace(/[^A-Z0-9]/g, '');
  }
  /**
   * Score how well an identifier from a statement line matches a catalog
   * identifier, tolerating truncation.
   *
   * Both ids are canonicalized with normId() first. A partial match requires
   * the shorter side to be at least 8 characters long.
   *
   * @param {*} idLine - Identifier found on the statement line.
   * @param {*} idCatalog - Identifier stored in the catalog.
   * @returns {number} 1 exact · 0.9 line id is a prefix of the catalog id ·
   *   0.85 catalog id is a prefix of the line id · 0.8 line id is a suffix of
   *   the catalog id · 0 otherwise (including when either id is empty).
   */
  function idPartialMatch(idLine, idCatalog) {
    const MIN_PARTIAL_LEN = 8;
    const lineId = normId(idLine);
    const catalogId = normId(idCatalog);

    if (lineId === '' || catalogId === '') return 0;
    if (lineId === catalogId) return 1;

    const lineLongEnough = lineId.length >= MIN_PARTIAL_LEN;
    const catalogLongEnough = catalogId.length >= MIN_PARTIAL_LEN;

    if (lineLongEnough && catalogId.startsWith(lineId)) return 0.9;
    if (catalogLongEnough && lineId.startsWith(catalogId)) return 0.85;
    // suffix-of (some sources strip the leading T/W)
    if (lineLongEnough && catalogId.endsWith(lineId)) return 0.8;
    return 0;
  }

  // ── signature for learning rules ──────────────────────────────────
  /**
   * Build the lookup key used to match a line against learned rules.
   *
   * Format: `<source>|<title>|<territory>` where source is the lower-cased
   * first non-empty of sourceKind / source / stmtSource, title is the first
   * 32 characters of the normalized trackTitle / workTitle / title, and
   * territory is used as-is ('' when missing).
   *
   * @param {object} line - Statement line.
   * @returns {string} Signature string.
   */
  function makeSignature(line) {
    const rawTitle = line.trackTitle || line.workTitle || line.title || '';
    const titleKey = normalize(rawTitle).slice(0, 32);

    const rawSource = line.sourceKind || line.source || line.stmtSource || '';
    const sourceKey = rawSource.toLowerCase();

    const territory = line.territory || '';
    return [sourceKey, titleKey, territory].join('|');
  }

  // ── core matcher ──────────────────────────────────────────────────
  /**
   * Match one statement line against the catalog using a stack of strategies
   * and return the ranked candidates.
   *
   * Strategies, in evaluation order: exact ISRC, truncated ISRC (only when no
   * candidate exists yet), UPC → release (releases come from window.RELEASES),
   * ISWC exact/partial → work, learned rule by signature, fuzzy title → work,
   * artist + title → recording, and fractional split candidates derived from
   * the first work candidate that was found.
   *
   * Identifier strategies only fire when the line's identifier still contains
   * something after normalization. A placeholder such as "--" or "n/a" would
   * otherwise compare equal to a catalog entry that has no identifier at all
   * and produce a near-certain false match.
   *
   * @param {object} line - Statement line (isrc, upc, iswc, trackTitle /
   *   workTitle / title, trackArtists / releaseArtists, stmtSourceKind /
   *   sourceKind, ... — all optional).
   * @param {object} [ctx] - Matching context.
   * @param {object[]} [ctx.recordings] - Catalog recordings.
   * @param {object[]} [ctx.works] - Catalog works.
   * @param {object[]} [ctx.rules] - Learned rules ({signature, kind, id, label, appliedCount}).
   * @param {object} [ctx.thresholds] - Per-source `{acceptAt, reviewAt}` overrides,
   *   looked up by line.stmtSourceKind, then line.sourceKind.
   * @returns {{status: string, confidence: number, candidates: object[], top: (object|undefined)}}
   *   status is 'matched', 'uncertain' or 'unmatched'; candidates are
   *   de-duplicated by kind+id and sorted by descending confidence.
   */
  function matchOne(line, ctx = {}) {
    const { recordings = [], works = [], rules = [] } = ctx;
    const cands = [];

    // 1) ISRC exact — skipped when the line's ISRC normalizes to nothing.
    const lineIsrc = normId(line.isrc);
    if (lineIsrc) {
      const r = recordings.find(x => normId(x.isrc) === lineIsrc);
      if (r) cands.push({ kind:'recording', id:r.id, label:r.title || r.name, conf:0.99, reason:`ISRC ${line.isrc} exact`, ref:r, strategy:'isrc' });
    }
    // 1b) ISRC prefix (truncated by 1-2 chars)
    if (line.isrc && cands.length === 0) {
      for (const r of recordings) {
        const p = idPartialMatch(line.isrc, r.isrc);
        if (p >= 0.8) { cands.push({ kind:'recording', id:r.id, label:r.title||r.name, conf: 0.7 + p*0.15, reason:`ISRC ${line.isrc} ≈ ${r.isrc} (truncated)`, ref:r, strategy:'isrc-partial' }); break; }
      }
    }
    // 2) UPC → release. UPCs are coerced to strings (they may arrive as
    //    numbers) and compared on digits only; a digit-less UPC never matches.
    const lineUpc = line.upc ? String(line.upc).replace(/\D/g, '') : '';
    if (lineUpc) {
      const releases = window.RELEASES || [];
      const rel = releases.find(x => String(x.upc || '').replace(/\D/g, '') === lineUpc);
      if (rel) cands.push({ kind:'release', id:rel.id, label:rel.title || rel.name, conf:0.92, reason:`UPC ${line.upc} matches release "${rel.title||rel.name}"`, ref:rel, strategy:'upc' });
    }
    // 2b) ISWC exact / partial → work
    if (line.iswc) {
      for (const w of works) {
        const p = idPartialMatch(line.iswc, w.iswc);
        if (p >= 0.85) { cands.push({ kind:'work', id:w.id, label:w.title || w.name, conf: 0.65 + p*0.32, reason: p === 1 ? `ISWC ${line.iswc} exact` : `ISWC ${line.iswc} ≈ ${w.iswc} (partial)`, ref:w, strategy:'iswc' }); break; }
      }
    }

    // 3) Learning rule
    const sig = makeSignature(line);
    const ruleHit = rules.find(r => r.signature === sig);
    if (ruleHit) {
      cands.push({ kind:ruleHit.kind, id:ruleHit.id, label:ruleHit.label, conf:0.95, reason:`Learned rule · same source/title pattern matched ${ruleHit.label} ${ruleHit.appliedCount}× before`, learned:true, strategy:'learned' });
    }

    // 4) Title fuzzy → works (with phonetic + trigram backstop)
    const titleField = line.trackTitle || line.workTitle || line.title;
    if (titleField) {
      let best = null;
      for (const w of works) {
        const s = titleScore(titleField, w.title || w.name);
        if (!best || s > best.s) best = { s, w };
      }
      if (best && best.s >= 0.55) {
        cands.push({
          kind:'work', id:best.w.id, label:best.w.title || best.w.name,
          conf: 0.45 + best.s * 0.45,
          reason: `Title match → "${best.w.title || best.w.name}" (combined ${(best.s*100).toFixed(0)}%)`,
          ref: best.w, strategy: 'title-fuzzy',
        });
      }
    }

    // 5) Artist + title → recording
    if (titleField && (line.trackArtists || line.releaseArtists)) {
      const artist = line.trackArtists || line.releaseArtists;
      let best = null;
      for (const r of recordings) {
        const ts = titleScore(titleField, r.title || r.name);
        const as = titleScore(artist, r.primaryArtistName || r.artist || '');
        const combined = ts * 0.65 + as * 0.35;
        if (!best || combined > best.s) best = { s: combined, r, ts, as };
      }
      if (best && best.s >= 0.5) {
        cands.push({
          kind:'recording', id:best.r.id, label:best.r.title || best.r.name,
          conf: 0.4 + best.s * 0.45,
          reason: `Artist + title combo · "${titleField}" by ${artist} ↔ "${best.r.title||best.r.name}" (T ${(best.ts*100).toFixed(0)} A ${(best.as*100).toFixed(0)})`,
          ref: best.r, strategy: 'artist-title',
        });
      }
    }

    // 6) Fractional / multi-holder split candidates
    // PRO statements often pay a *share* of a work. If the matched work has
    // multiple writers/publishers, surface fractional candidates so the user
    // can map this line to a specific share. Only the first three splits with
    // a positive share are surfaced, each 0.1 below the work candidate.
    const topWorkCand = cands.find(c => c.kind === 'work');
    if (topWorkCand && topWorkCand.ref) {
      const w = topWorkCand.ref;
      const splits = w.splits || w.writers || w.shares || [];
      if (Array.isArray(splits) && splits.length >= 2) {
        for (const s of splits.slice(0, 3)) {
          const lbl = s.partyName || s.name || s.party || s.role || 'split';
          const pct = s.share || s.pct || s.percent || 0;
          if (pct > 0) {
            cands.push({
              kind:'work-share', id: `${w.id}::${lbl}`, label: `${w.title || w.name} · ${lbl} (${pct.toFixed ? pct.toFixed(2) : pct}%)`,
              conf: (topWorkCand.conf || 0.7) - 0.1,
              reason: `Fractional split · ${lbl} holds ${pct}% of "${w.title || w.name}"`,
              ref: { work: w, split: s }, strategy: 'fractional',
            });
          }
        }
      }
    }

    // Dedupe (keep the highest-confidence candidate per kind+id) + sort
    const dedup = new Map();
    for (const c of cands) {
      const k = c.kind + ':' + c.id;
      if (!dedup.has(k) || dedup.get(k).conf < c.conf) dedup.set(k, c);
    }
    const sorted = [...dedup.values()].sort((a, b) => b.conf - a.conf);
    const top = sorted[0];

    // Threshold: source-aware if calibration is loaded
    const tuned = (ctx.thresholds && (ctx.thresholds[line.stmtSourceKind] || ctx.thresholds[line.sourceKind])) || null;
    const acceptT = tuned?.acceptAt || 0.85;
    const reviewT = tuned?.reviewAt || 0.55;

    let status = 'unmatched';
    if (top) {
      if (top.conf >= acceptT) status = 'matched';
      else if (top.conf >= reviewT) status = 'uncertain';
    }
    return { status, confidence: top?.conf || 0, candidates: sorted, top };
  }

  // ── batch with cluster-pass ───────────────────────────────────────
  // After per-line matching, group the lines that were not auto-matched
  // by (statement, artist). If a group has ≥3 lines and a release by that
  // artist scores well enough against the group's titles, propose that
  // album to every line in the group as a boosted cluster suggestion.
  /**
   * Match a batch of statement lines, then run a cluster pass over the lines
   * that were not auto-matched.
   *
   * Cluster pass: lines whose status is not 'matched' are grouped by
   * (stmtId, first 30 chars of the normalized artist). For every group of at
   * least 3 lines, the first 800 entries of window.RELEASES are scored on
   * artist similarity (must be >= 0.55) plus the fraction of group lines whose
   * title scores >= 0.4 against the release title. If the best release scores
   * >= 0.55, a 'cluster' release candidate is added to every line in the group
   * and top / confidence / status are recomputed.
   *
   * Status is recomputed with the same per-source thresholds as matchOne()
   * (ctx.thresholds keyed by stmtSourceKind, then sourceKind; defaults 0.85 /
   * 0.55). A status is only ever raised here, never lowered.
   *
   * @param {object[]} lines - Statement lines.
   * @param {object} [ctx] - Same context object matchOne() accepts.
   * @returns {object[]} One result per line: `{line, status, confidence,
   *   candidates, top}` plus `cluster: {artist, size, release}` on lines that
   *   received a cluster candidate.
   */
  function matchBatch(lines, ctx = {}) {
    const results = lines.map(ln => ({ line: ln, ...matchOne(ln, ctx) }));

    // Group unmatched/uncertain lines by (statement, artist)
    const groups = new Map();
    for (const r of results) {
      if (r.status === 'matched') continue;
      const ln = r.line;
      const artist = normalize(ln.trackArtists || ln.releaseArtists || '');
      const stmt = ln.stmtId || '';
      if (!artist) continue;
      const k = stmt + '|' + artist.slice(0, 30);
      if (!groups.has(k)) groups.set(k, []);
      groups.get(k).push(r);
    }

    for (const group of groups.values()) {
      if (group.length < 3) continue;
      // Find the release that best fits this group's artist and titles.
      const releases = window.RELEASES || [];
      const artist = group[0].line.trackArtists || group[0].line.releaseArtists;
      let bestRel = null;
      let bestSc = 0;
      for (const rel of releases.slice(0, 800)) {
        const as = titleScore(artist, rel.primaryArtistName || rel.artist || '');
        if (as < 0.55) continue;
        let nMatches = 0;
        for (const r of group) {
          const ts = titleScore(r.line.trackTitle || r.line.title || '', rel.title || rel.name);
          if (ts >= 0.4) nMatches++;
        }
        const sc = (nMatches / group.length) * 0.6 + as * 0.4;
        if (sc > bestSc) { bestSc = sc; bestRel = rel; }
      }
      if (!bestRel || bestSc < 0.55) continue;

      // Inject a cluster candidate into every member of the group
      for (const r of group) {
        const clusterCand = {
          kind:'release', id:bestRel.id, label: bestRel.title || bestRel.name,
          conf: Math.min(0.92, 0.65 + bestSc * 0.25),
          reason: `Cluster · ${group.length} unmatched lines from this statement share artist "${artist}" — proposing album "${bestRel.title || bestRel.name}"`,
          ref: bestRel, strategy: 'cluster',
        };
        r.candidates = [clusterCand, ...(r.candidates || [])].sort((a, b) => b.conf - a.conf);
        r.top = r.candidates[0];
        r.confidence = r.top.conf;

        // Same source-aware threshold lookup as matchOne().
        const tuned = (ctx.thresholds && (ctx.thresholds[r.line.stmtSourceKind] || ctx.thresholds[r.line.sourceKind])) || null;
        const acceptT = tuned?.acceptAt || 0.85;
        const reviewT = tuned?.reviewAt || 0.55;
        if (r.confidence >= acceptT) r.status = 'matched';
        else if (r.confidence >= reviewT) r.status = 'uncertain';

        r.cluster = { artist, size: group.length, release: bestRel.id };
      }
    }

    return results;
  }

  // ── calibration ───────────────────────────────────────────────────
  // Build a histogram of confidence scores, per source. Recommend an
  // accept-threshold at the inflection where the precision (= % of
  // manual-confirmed-correct in that bucket) drops below 0.9.
  /**
   * Summarize confidence scores per source and suggest an accept threshold.
   *
   * Results are grouped by line.stmtSource, else line.sourceKind, else
   * 'unknown'. For each source a 10-bucket histogram of confidence is built
   * (bucket = floor(confidence * 10), capped at 9) together with a parallel
   * count of results that are status 'matched' or flagged `manual`.
   *
   * The suggestion starts at 0.85. Buckets 9 down to 5 are scanned; at the
   * first bucket whose matched/total ratio is below 0.9 the suggestion becomes
   * that bucket's upper edge ((index + 1) / 10). Empty buckets count as 1.
   *
   * @param {object[]} results - Items shaped `{line, status, confidence, manual?}`.
   * @returns {object[]} `{source, n, hist, matched, avg, suggestedAcceptAt}`
   *   per source, sorted by descending n.
   */
  function calibrate(results) {
    const BUCKETS = 10;

    const grouped = new Map();
    results.forEach((res) => {
      const key = res.line.stmtSource || res.line.sourceKind || 'unknown';
      const rows = grouped.get(key);
      if (rows) rows.push(res);
      else grouped.set(key, [res]);
    });

    const report = [...grouped].map(([source, rows]) => {
      const hist = Array.from({ length: BUCKETS }, () => 0);
      const matched = Array.from({ length: BUCKETS }, () => 0);
      let total = 0;

      for (const row of rows) {
        const conf = row.confidence || 0;
        const slot = Math.min(BUCKETS - 1, Math.floor(conf * 10));
        hist[slot] += 1;
        if (row.status === 'matched' || row.manual) matched[slot] += 1;
        total += conf;
      }

      // Walk from the top bucket down to 0.5 and stop at the first weak one.
      let suggestedAcceptAt = 0.85;
      for (let slot = BUCKETS - 1; slot >= 5; slot -= 1) {
        const precision = hist[slot] === 0 ? 1 : matched[slot] / hist[slot];
        if (precision < 0.9) {
          suggestedAcceptAt = (slot + 1) / 10;
          break;
        }
      }

      return {
        source,
        n: rows.length,
        hist,
        matched,
        avg: total / (rows.length || 1),
        suggestedAcceptAt,
      };
    });

    return report.sort((x, y) => y.n - x.n);
  }

  // Publish the engine on window. Beyond the two entry points, the scoring
  // primitives are exposed as well so they can be exercised directly.
  const matcherEngineApi = {
    matchOne,
    matchBatch,
    makeSignature,
    normalize,
    metaphone,
    trigramCos,
    titleScore,
    jaccard,
    metaphoneSim,
    idPartialMatch,
    calibrate,
  };
  Object.assign(window, { MatcherEngine: matcherEngineApi });
})();
