"""
KNN algorithm for structural and image similarity computation.
Uses numpy for fast Euclidean distance computation.
"""
import numpy as np


def build_onehot_matrix(records, cat_cols):
    """
    Build one-hot binary feature matrix from list of dicts.

    Args:
        records: list of dicts, each with keys from cat_cols
        cat_cols: list of column names to encode

    Returns:
        (matrix, feature_names, record_indices)
        - matrix: np.ndarray of shape (n_records, n_features)
        - feature_names: list of feature name strings
        - record_indices: list mapping matrix row -> original record index
    """
    # Build vocabulary per column
    vocab = {}
    for col in cat_cols:
        values = set()
        for rec in records:
            val = rec.get(col)
            if val and str(val).strip():
                values.add(str(val).strip().lower())
        if values:
            vocab[col] = sorted(values)

    # Build feature name list
    feature_names = []
    for col in cat_cols:
        if col in vocab:
            for val in vocab[col]:
                feature_names.append(f'{col}:{val}')

    if not feature_names:
        return np.zeros((len(records), 1)), ['no_features'], list(range(len(records)))

    # Build matrix
    n = len(records)
    m = len(feature_names)
    matrix = np.zeros((n, m), dtype=np.float32)

    # Build feature index map
    feat_idx = {name: i for i, name in enumerate(feature_names)}

    for row_idx, rec in enumerate(records):
        for col in cat_cols:
            if col in vocab:
                val = rec.get(col)
                if val and str(val).strip():
                    key = f'{col}:{str(val).strip().lower()}'
                    if key in feat_idx:
                        matrix[row_idx, feat_idx[key]] = 1.0

    return matrix, feature_names, list(range(n))


def knn_search(query_idx, matrix, k=10, city_mask=None):
    """
    Find k nearest neighbors for query_idx in matrix using Euclidean distance.

    Args:
        query_idx: index of the query record in matrix
        matrix: np.ndarray of shape (n, d)
        k: number of neighbors to return
        city_mask: optional boolean array of shape (n,); if provided, only consider
                   records where city_mask is True

    Returns:
        list of (neighbor_idx, distance) sorted by distance ASC (excluding query itself)
    """
    query_vec = matrix[query_idx]

    if city_mask is not None:
        candidate_indices = np.where(city_mask)[0]
        # Remove query itself
        candidate_indices = candidate_indices[candidate_indices != query_idx]
        if len(candidate_indices) == 0:
            return []
        candidate_matrix = matrix[candidate_indices]
    else:
        # All rows except query
        all_indices = np.arange(matrix.shape[0])
        candidate_indices = all_indices[all_indices != query_idx]
        candidate_matrix = matrix[candidate_indices]

    # Compute Euclidean distances
    diff = candidate_matrix - query_vec
    distances = np.sqrt(np.sum(diff ** 2, axis=1))

    # Get top k
    k_actual = min(k, len(candidate_indices))
    top_k_local = np.argpartition(distances, k_actual - 1)[:k_actual]
    top_k_local = top_k_local[np.argsort(distances[top_k_local])]

    results = []
    for local_idx in top_k_local:
        orig_idx = int(candidate_indices[local_idx])
        dist = float(distances[local_idx])
        results.append((orig_idx, dist))

    return results


def distance_to_score(distance, max_distance=10.0):
    """Convert Euclidean distance to similarity score in [0, 1]."""
    if max_distance <= 0:
        return 1.0
    score = max(0.0, 1.0 - distance / max_distance)
    return score