scd/microsound/wavesets_k-means_clustering/kmeans.py

#!/usr/bin/env python
import numpy as np
import soundfile as sf
from sklearn.cluster import KMeans
from scipy.signal import butter, lfilter

def highpass_filter(audio, sr, cutoff=20.0):
    b, a = butter(1, cutoff / (sr / 2), btype='highpass')
    return lfilter(b, a, audio)

def find_upward_zero_crossings(signal):
    return np.where((signal[:-1] < 0) & (signal[1:] >= 0))[0] + 1

def extract_wavesets(signal):
    zc = find_upward_zero_crossings(signal)
    return [signal[zc[i]:zc[i+1]] for i in range(len(zc) - 1)], zc

def compute_features(wavesets):
    lengths = np.array([len(w) for w in wavesets])
    rms = np.array([np.sqrt(np.mean(w**2)) for w in wavesets])
    return lengths, rms

def normalize_and_weight(lengths, rms, w):
    lengths = (lengths - np.mean(lengths)) / np.std(lengths)
    rms = (rms - np.mean(rms)) / np.std(rms)
    lengths *= w
    return np.stack([lengths, rms], axis=1)

def replace_with_representatives(wavesets, labels, centroids, features):
    reps = []
    for k in range(centroids.shape[0]):
        cluster_indices = np.where(labels == k)[0]
        cluster_features = features[cluster_indices]
        dists = np.linalg.norm(cluster_features - centroids[k], axis=1)
        rep_idx = cluster_indices[np.argmin(dists)]
        reps.append(wavesets[rep_idx])
    return [reps[label] for label in labels]

def reconstruct_signal(replaced_sets, zero_crossings, length):
    output = np.zeros(length)
    cursor = zero_crossings[0]
    for i, ws in enumerate(replaced_sets):
        end = cursor + len(ws)
        if end <= len(output):
            output[cursor:end] = ws
            cursor = end
        else:
            break
    return output

def waveset_clustering_effect(filepath, output_path, w=5, clusters_per_sec=20):
    signal, sr = sf.read(filepath)
    if signal.ndim > 1:
        signal = signal.mean(axis=1)  # Mono

    signal = highpass_filter(signal, sr)
    wavesets, zero_crossings = extract_wavesets(signal)
    lengths, rms = compute_features(wavesets)
    features = normalize_and_weight(lengths, rms, w)

    total_time = len(signal) / sr
    k = int(clusters_per_sec * total_time)
    k = max(2, min(k, len(wavesets)))  # Avoid trivial or impossible cases

    kmeans = KMeans(n_clusters=k, random_state=0).fit(features)
    replaced_sets = replace_with_representatives(wavesets, kmeans.labels_, kmeans.cluster_centers_, features)

    reconstructed = reconstruct_signal(replaced_sets, zero_crossings, len(signal))
    sf.write(output_path, reconstructed, sr)

# Example usage:
waveset_clustering_effect("input.wav", "output.wav", w=5, clusters_per_sec=15)