+
    çj                     ,    R t ^ RItR tRR ltRR ltR# )zt
KNN algorithm for structural and image similarity computation.
Uses numpy for fast Euclidean distance computation.
Nc           	        / pV F  p\        4       pV  Fu  pVP                  V4      pV'       g   K  \        V4      P                  4       '       g   K?  VP	                  \        V4      P                  4       P                  4       4       Kw  	  V'       g   K  \        V4      W#&   K  	  . pV F0  pW29   g   K  W#,           F  pVP                  V RV 24       K  	  K2  	  V'       gA   \        P                  ! \        V 4      ^34      R.\        \        \        V 4      4      4      3# \        V 4      p\        V4      p	\        P                  ! W3\        P                  R7      p
\        V4       UUu/ uF  w  rWbK	  	  ppp\        V 4       F  w  rV F  pW29   g   K  VP                  V4      pV'       g   K&  \        V4      P                  4       '       g   KG  V R\        V4      P                  4       P                  4        2pW9   g   K{  RWW,          3&   K  	  K  	  W\        \        V4      4      3# u uppi )a  
Build one-hot binary feature matrix from list of dicts.

Args:
    records: list of dicts, each with keys from cat_cols
    cat_cols: list of column names to encode

Returns:
    (matrix, feature_names, record_indices)
    - matrix: np.ndarray of shape (n_records, n_features)
    - feature_names: list of feature name strings
    - record_indices: list mapping matrix row -> original record index
:no_features)dtype      ?)setgetstrstripaddlowersortedappendnpzeroslenlistrangefloat32	enumerate)recordscat_colsvocabcolvaluesrecvalfeature_namesnmmatrixinamefeat_idxrow_idxkeys   &&              L/Users/jokubas/Desktop/KTU/big_data/engineering/processing/algorithms/knn.pybuild_onehot_matrixr'      s    EC''#,Css3x~~''

3s8>>+1134  6EJ  M<zz$$uAcU^4 " 
 xxWq)*]OT%GBU=VVV 	GAMAXXqfBJJ/F (1'?@'?GA'?H@!'*C|ggcl33s8>>++ E3s8>>#3#9#9#;"<=C9<56  + $uQx.00 As   3H:c                   W,          pVeD   \         P                  ! V4      ^ ,          pWUV 8g  ,          p\        V4      ^ 8X  d   . # W,          pM:\         P                  ! VP                  ^ ,          4      pWwV 8g  ,          pW,          pWd,
          p\         P
                  ! \         P                  ! V^,          ^R7      4      p	\        V\        V4      4      p
\         P                  ! W^,
          4      RV
 pV\         P                  ! W,          4      ,          p. pV F7  p\        W],          4      p\        W,          4      pVP                  W34       K9  	  V# )a  
Find k nearest neighbors for query_idx in matrix using Euclidean distance.

Args:
    query_idx: index of the query record in matrix
    matrix: np.ndarray of shape (n, d)
    k: number of neighbors to return
    city_mask: optional boolean array of shape (n,); if provided, only consider
               records where city_mask is True

Returns:
    list of (neighbor_idx, distance) sorted by distance ASC (excluding query itself)
N)axis)r   wherer   arangeshapesqrtsumminargpartitionargsortintfloatr   )	query_idxr    k	city_mask	query_veccandidate_indicescandidate_matrixall_indicesdiff	distancesk_actualtop_k_localresults	local_idxorig_idxdists   &&&&            r&   
knn_searchrC   ?   s)    !IHHY/2-9.LM !Q&I!4 iiQ0'y(@A!4 'Dtqyq12I 1c+,-H//)\:9HEKbjj)?@AKG 	(34Y)*'( !
 N    c                J    V^ 8:  d   R# \        RRW,          ,
          4      pV# )z9Convert Euclidean distance to similarity score in [0, 1].r   g        )max)distancemax_distancescores   && r&   distance_to_scorerJ   n   s&    qS8223ELrD   )
   N)g      $@)__doc__numpyr   r'   rC   rJ    rD   r&   <module>rO      s     41n,^rD   