feat: audio upload + AI-assisted tempo map generation

Users can now upload any audio file to generate a CTP tempo map:

BPM detection (lib/analysis/bpm-detect.ts):
- Runs entirely client-side via Web Audio API — audio is never uploaded
- Decodes any browser-supported format (MP3, WAV, AAC, OGG, FLAC, M4A)
- Energy envelope → onset strength → autocorrelation over 55–210 BPM range
- Returns BPM, normalised confidence score, duration, and optional half-time BPM
  for songs where a double-time pulse is detected

AI CTP generation (lib/analysis/ai-ctp.ts):
- Calls Claude (claude-opus-4-6) with adaptive thinking + structured JSON output
- System prompt explains CTP rules and section layout conventions
- Claude uses knowledge of well-known songs to produce accurate section maps;
  falls back to a sensible generic structure for unknown tracks
- Only BPM + duration + optional metadata is sent to the server (no audio data)

API route (app/api/analyze/route.ts):
- POST /api/analyze accepts { bpm, duration, title?, artist?, mbid?, contributed_by? }
- Validates input, calls generateCTPWithAI, runs CTP schema validation
- Returns { ctp, warnings } — warnings are surfaced in the UI rather than 500-ing

UI (components/TempoAnalyzer.tsx, app/(web)/analyze/page.tsx):
- Drag-and-drop or browse file upload
- Shows BPM, confidence, duration after detection
- Half-time toggle when double-time is detected
- Metadata form: title, artist, MusicBrainz ID, contributor name
  (filename parsed into artist/title as a convenience default)
- AI generation with streaming-style progress states
- Sections review via TempoMapEditor
- Download .ctp.json or submit directly to the database

Also: added @anthropic-ai/sdk to package.json, ANTHROPIC_API_KEY to .env.example,
updated next.config.mjs serverComponentsExternalPackages, added Analyze nav link.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
AJ Avezzano
2026-04-01 11:43:14 -04:00
parent 331a5fbfca
commit 51f67f0aeb
10 changed files with 1023 additions and 29 deletions

179
lib/analysis/ai-ctp.ts Normal file
View File

@@ -0,0 +1,179 @@
/**
* AI-assisted CTP document generation
*
* Takes the results of BPM detection (and optional song metadata) and uses
* Claude to produce a plausible, well-structured CTP document.
*
* Claude is asked to:
* - Divide the song into typical sections (Intro, Verse, Chorus, Bridge…)
* - Assign realistic start bars for each section
* - Note any tempo changes it would expect for the song/genre
* - Return a fully valid CTP 1.0 JSON document
*
* The caller should treat the result as a *draft* — the generated sections
* are educated guesses and should be verified against the recording.
*/
import Anthropic from "@anthropic-ai/sdk";
import type { CTPDocument } from "@/lib/ctp/schema";
const client = new Anthropic();
// ─── Input / output types ─────────────────────────────────────────────────────
export interface AnalysisInput {
bpm: number;
duration: number; // seconds
title?: string;
artist?: string;
mbid?: string | null;
contributedBy?: string;
}
// ─── JSON Schema for structured output ───────────────────────────────────────
// Must be strict (no additionalProperties, all required fields present).
const CTP_SCHEMA = {
type: "object",
additionalProperties: false,
required: ["version", "metadata", "count_in", "sections"],
properties: {
version: { type: "string", enum: ["1.0"] },
metadata: {
type: "object",
additionalProperties: false,
required: [
"title", "artist", "mbid", "duration_seconds",
"contributed_by", "verified", "created_at",
],
properties: {
title: { type: "string" },
artist: { type: "string" },
mbid: { type: ["string", "null"] },
duration_seconds: { type: "number" },
contributed_by: { type: "string" },
verified: { type: "boolean" },
created_at: { type: "string" },
},
},
count_in: {
type: "object",
additionalProperties: false,
required: ["enabled", "bars", "use_first_section_tempo"],
properties: {
enabled: { type: "boolean" },
bars: { type: "integer", minimum: 1, maximum: 8 },
use_first_section_tempo: { type: "boolean" },
},
},
sections: {
type: "array",
minItems: 1,
items: {
type: "object",
additionalProperties: false,
required: ["label", "start_bar", "time_signature", "transition"],
// bpm is required for step, bpm_start/bpm_end for ramp — handled via oneOf
// but we keep this schema simple (strict mode) and validate downstream with Zod.
properties: {
label: { type: "string" },
start_bar: { type: "integer", minimum: 1 },
bpm: { type: "number" },
bpm_start: { type: "number" },
bpm_end: { type: "number" },
transition: { type: "string", enum: ["step", "ramp"] },
time_signature: {
type: "object",
additionalProperties: false,
required: ["numerator", "denominator"],
properties: {
numerator: { type: "integer", minimum: 1, maximum: 32 },
denominator: { type: "integer", enum: [1, 2, 4, 8, 16, 32] },
},
},
},
},
},
},
};
// ─── System prompt ────────────────────────────────────────────────────────────
const SYSTEM_PROMPT = `\
You are an expert music producer and session musician assisting cover bands with click tracks.
You will receive automated BPM detection results for a song and must generate a CTP (Click Track Protocol) document describing the song's full tempo map.
CTP rules:
- "version" must be "1.0"
- sections[0].start_bar must be 1
- sections must be sorted by start_bar ascending, with no gaps
- Step sections have a single "bpm" field; ramp sections have "bpm_start" and "bpm_end" (no "bpm" field)
- All BPM values must be between 20 and 400
- time_signature.denominator must be a power of 2 (1, 2, 4, 8, 16, or 32)
- metadata.verified must be false (this is AI-generated, not human-verified)
- metadata.created_at must be an ISO 8601 datetime string
Guidelines for section layout:
- Use typical pop/rock section names: Intro, Verse, Pre-Chorus, Chorus, Bridge, Outro
- Estimate bar counts based on song duration and BPM (bars = duration_seconds × BPM / 60 / beats_per_bar)
- Most songs are 4/4; note any unusual meters if you know the song
- If you know the song has a tempo change (ritardando, double-time feel, key change with tempo shift), model it with a ramp or step section
- If unsure about sections, use a single constant-tempo section covering the whole song
- Use the detected BPM as the primary tempo — do not invent a different BPM unless the song is well-known to have a different tempo
The output is a draft for human review. Add reasonable section structure based on the song's typical arrangement.`;
// ─── Main function ────────────────────────────────────────────────────────────
export async function generateCTPWithAI(input: AnalysisInput): Promise<CTPDocument> {
const { bpm, duration, title, artist, mbid, contributedBy } = input;
const approxBars = Math.round((duration * bpm) / 60 / 4); // assuming 4/4
const userMessage = `\
Generate a CTP document for the following song:
Title: ${title ?? "Unknown Title"}
Artist: ${artist ?? "Unknown Artist"}
MusicBrainz ID: ${mbid ?? "unknown"}
Detected BPM: ${bpm}
Duration: ${duration.toFixed(1)} seconds (~${approxBars} bars at 4/4)
Contributed by: ${contributedBy ?? "anonymous"}
Create a plausible section layout for this song. If this is a well-known song, use your knowledge of its actual arrangement. If not, use a sensible generic structure.`;
const response = await client.messages.create({
model: "claude-opus-4-6",
max_tokens: 2048,
thinking: { type: "adaptive" },
system: SYSTEM_PROMPT,
messages: [{ role: "user", content: userMessage }],
output_config: {
format: {
type: "json_schema",
schema: CTP_SCHEMA,
},
},
});
const textBlock = response.content.find((b) => b.type === "text");
if (!textBlock || textBlock.type !== "text") {
throw new Error("Claude did not return a text block");
}
let parsed: unknown;
try {
parsed = JSON.parse(textBlock.text);
} catch {
throw new Error(`Claude returned invalid JSON: ${textBlock.text.slice(0, 200)}`);
}
// Stamp the current timestamp if Claude left a placeholder
const doc = parsed as CTPDocument;
if (!doc.metadata.created_at || doc.metadata.created_at.includes("placeholder")) {
doc.metadata.created_at = new Date().toISOString();
}
return doc;
}

187
lib/analysis/bpm-detect.ts Normal file
View File

@@ -0,0 +1,187 @@
/**
* Client-side BPM detection
*
* Runs entirely in the browser using the Web Audio API (no server round-trip
* for the audio itself). The algorithm:
*
* 1. Decode the audio file into PCM via AudioContext.decodeAudioData()
* 2. Mix to mono, optionally resample to 22050 Hz
* 3. Compute a short-time energy envelope (512-sample frames)
* 4. Derive an onset-strength signal via half-wave-rectified first difference
* 5. Autocorrelate the onset signal over lags corresponding to 55210 BPM
* 6. Pick the lag with the highest correlation; also test its 2× harmonic
* (halving the BPM) as a tiebreaker for double-time detections
*
* Typical accuracy is ±12 BPM on produced music with a clear beat.
* Rubato, live recordings, or highly syncopated rhythms may need manual adjustment.
*/
export interface BPMDetectionResult {
bpm: number;
/** Normalised confidence 01. Values above ~0.4 are generally reliable. */
confidence: number;
/** Total duration of the source file in seconds. */
duration: number;
/** The raw analysis produced a half-time alternative; user may prefer it. */
halfTimeBpm: number | null;
}
// ─── Internal helpers ─────────────────────────────────────────────────────────
function mixToMono(buffer: AudioBuffer): Float32Array {
const n = buffer.length;
if (buffer.numberOfChannels === 1) {
return buffer.getChannelData(0).slice();
}
const mono = new Float32Array(n);
for (let c = 0; c < buffer.numberOfChannels; c++) {
const ch = buffer.getChannelData(c);
for (let i = 0; i < n; i++) mono[i] += ch[i];
}
const scale = 1 / buffer.numberOfChannels;
for (let i = 0; i < n; i++) mono[i] *= scale;
return mono;
}
function energyEnvelope(samples: Float32Array, frameSize: number): Float32Array {
const numFrames = Math.floor(samples.length / frameSize);
const env = new Float32Array(numFrames);
for (let i = 0; i < numFrames; i++) {
let sum = 0;
const base = i * frameSize;
for (let j = 0; j < frameSize; j++) {
const s = samples[base + j];
sum += s * s;
}
env[i] = Math.sqrt(sum / frameSize);
}
return env;
}
/**
* Half-wave-rectified first difference of the energy envelope.
* Positive spikes correspond to onset events (energy increases).
*/
function onsetStrength(env: Float32Array): Float32Array {
const onset = new Float32Array(env.length);
for (let i = 1; i < env.length; i++) {
const diff = env[i] - env[i - 1];
onset[i] = diff > 0 ? diff : 0;
}
return onset;
}
/**
* Normalised autocorrelation at a given lag.
* Returns a value in [-1, 1].
*/
function autocorrAtLag(signal: Float32Array, lag: number): number {
const n = signal.length - lag;
if (n <= 0) return 0;
let sumXX = 0;
let sumYY = 0;
let sumXY = 0;
for (let i = 0; i < n; i++) {
const x = signal[i];
const y = signal[i + lag];
sumXX += x * x;
sumYY += y * y;
sumXY += x * y;
}
const denom = Math.sqrt(sumXX * sumYY);
return denom > 0 ? sumXY / denom : 0;
}
// ─── Public API ───────────────────────────────────────────────────────────────
/**
* Analyses a user-provided audio file and returns the estimated BPM.
* Must be called from a browser environment (requires Web Audio API).
*
* @param file An audio File (MP3, WAV, AAC, OGG — anything the browser decodes)
* @param signal An optional AbortSignal to cancel long analysis
*/
export async function detectBPM(
file: File,
signal?: AbortSignal
): Promise<BPMDetectionResult> {
// Decode at 22050 Hz to reduce computation while keeping enough resolution
const targetSampleRate = 22050;
const audioCtx = new AudioContext({ sampleRate: targetSampleRate });
try {
const arrayBuffer = await file.arrayBuffer();
if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
if (signal?.aborted) throw new DOMException("Aborted", "AbortError");
const duration = audioBuffer.duration;
const sampleRate = audioBuffer.sampleRate; // may differ from targetSampleRate
const mono = mixToMono(audioBuffer);
// Analyse a representative middle segment (skip silent intros/outros).
// Cap at 90 s so analysis stays fast even on long recordings.
const analysisStart = Math.floor(sampleRate * Math.min(10, duration * 0.1));
const analysisEnd = Math.min(
mono.length,
analysisStart + Math.floor(sampleRate * 90)
);
const segment = mono.subarray(analysisStart, analysisEnd);
// Energy envelope: ~23 ms frames at 22050 Hz
const FRAME_SIZE = 512;
const frameRate = sampleRate / FRAME_SIZE; // frames per second
const env = energyEnvelope(segment, FRAME_SIZE);
const onset = onsetStrength(env);
// Lag bounds for 55210 BPM
const minLag = Math.max(1, Math.round((frameRate * 60) / 210));
const maxLag = Math.round((frameRate * 60) / 55);
// Sweep lags and collect correlations
let bestLag = minLag;
let bestCorr = -Infinity;
for (let lag = minLag; lag <= maxLag; lag++) {
const corr = autocorrAtLag(onset, lag);
if (corr > bestCorr) {
bestCorr = corr;
bestLag = lag;
}
}
const rawBpm = (frameRate * 60) / bestLag;
// Round to one decimal place
const bpm = Math.round(rawBpm * 10) / 10;
// Check whether the half-time (bpm/2) has comparable correlation —
// double-time detections are common on songs with a 2-beat pulse.
const halfTimeLag = bestLag * 2;
let halfTimeBpm: number | null = null;
if (halfTimeLag <= maxLag * 2) {
const halfCorr = autocorrAtLag(onset, halfTimeLag);
if (halfCorr > bestCorr * 0.85) {
halfTimeBpm = Math.round((rawBpm / 2) * 10) / 10;
}
}
// Normalise confidence against the best possible correlation in the range
const maxPossibleCorr = Math.max(
...Array.from({ length: maxLag - minLag + 1 }, (_, i) =>
Math.abs(autocorrAtLag(onset, minLag + i))
)
);
const confidence =
maxPossibleCorr > 0
? Math.max(0, Math.min(1, bestCorr / maxPossibleCorr))
: 0;
return { bpm, confidence, duration, halfTimeBpm };
} finally {
await audioCtx.close();
}
}