fastmatch.knn_voyager
import json
import os
from datetime import datetime, timezone

import numpy as np
import voyager


class VoyagerNearestNeighbors:
    def __init__(self, space: str = "euclidean", index_type: str = "hnsw", **kwargs):
        """
        NearestNeighbors object using voyager for speed. Offers sklearn-style
        fit() / kneighbors() methods plus save() / load() for persistence.
        Utilizes the `voyager` library https://github.com/spotify/voyager .

        Args:
            space (str) : Distance metric for finding nearest neighbors.
                Matched case-insensitively; only "euclidean" is implemented.
                (default: "euclidean")
            index_type (str) : Index type within voyager to use. It is stored
                and written to the saved metadata.
                (supported: "hnsw")
            **kwargs: Accepted and ignored.

        Raises:
            NotImplementedError: If `space` is anything other than "euclidean".
        """
        space_key = space.lower()
        if space_key != "euclidean":
            raise NotImplementedError(f"Space {space} not implemented.")
        self.space = voyager.Space.Euclidean
        self.space_str = space_key
        self.index_type = index_type

    def fit(self, X: np.ndarray):
        """Create voyager index and train with data.

        Args:
            X (np.array): Array of N samples of shape (N, M)

        Returns:
            self: Fitted object
        """
        self.index_ = voyager.Index(self.space, num_dimensions=X.shape[1])
        self.index_.add_items(X)
        return self

    def kneighbors(self, X: np.ndarray, n_neighbors: int = 1):
        """Find the k nearest neighbors of each sample in X

        Args:
            X (np.array): Array of shape (N, M) of samples to search
                for neighbors of. M must be the same as the fit data.
            n_neighbors (int, optional): Number of neighbors to find.
                Defaults to 1.

        Returns:
            (distances, indices): Two np.array objects of shape (N, n_neighbors)
                containing the distances and indices of the closest neighbors.
                Indices are cast to int64.
        """
        # voyager hands back (ids, distances); the return order is swapped so
        # callers receive (distances, indices).
        # NOTE(review): distances are passed through exactly as voyager reports
        # them for its Euclidean space - confirm they match what callers expect.
        neighbor_ids, distances = self.index_.query(X, k=n_neighbors)
        return distances, neighbor_ids.astype(np.int64)

    def save(self, filepath: str):
        """Save Voyager index and metadata to disk for persistence across sessions.

        This saves two files:
        - {filepath}.index : Binary Voyager index
        - {filepath}.meta : JSON metadata with hyperparameters

        Missing parent directories of `filepath` are created first.

        Args:
            filepath (str): Base path for saving files (without extension)

        Raises:
            ValueError: If the model has not been fitted yet.

        Example:
            model.save("./indices/control_pop")
            # Creates: control_pop.index and control_pop.meta
        """
        if not hasattr(self, "index_"):
            raise ValueError("Model must be fitted before saving. Call fit() first.")

        index_path = f"{filepath}.index"
        meta_path = f"{filepath}.meta"

        # Both files share a directory; create it so a fresh location such as
        # "./indices/..." works without manual setup.
        parent_dir = os.path.dirname(index_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        self.index_.save(index_path)

        metadata = {
            "backend": "voyager",
            "index_type": self.index_type,
            "space": self.space_str,
            "n_samples": len(self.index_),
            "n_features": self.index_.num_dimensions,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "version": "0.2.0",
        }

        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

    @classmethod
    def load(cls, filepath: str):
        """Load Voyager index and metadata from disk.

        Args:
            filepath (str): Base path to load from (without extension)

        Returns:
            VoyagerNearestNeighbors: Loaded model ready for kneighbors() queries

        Raises:
            FileNotFoundError: If the .index or .meta file is missing.
            NotImplementedError: If the stored space is not supported.

        Example:
            model = VoyagerNearestNeighbors.load("./indices/control_pop")
            distances, indices = model.kneighbors(X_query, n_neighbors=5)
        """
        index_path = f"{filepath}.index"
        meta_path = f"{filepath}.meta"

        if not os.path.exists(index_path):
            raise FileNotFoundError(f"Index file not found: {index_path}")
        if not os.path.exists(meta_path):
            raise FileNotFoundError(f"Metadata file not found: {meta_path}")

        with open(meta_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        instance = cls(
            space=metadata["space"],
            index_type=metadata["index_type"],
        )

        # The constructor has already validated the space name and resolved it
        # to a voyager enum, so reuse that instead of deriving it a second time.
        instance.index_ = voyager.Index.load(
            index_path,
            space=instance.space,
            num_dimensions=metadata["n_features"],
        )

        return instance
class VoyagerNearestNeighbors:
    def __init__(self, space: str = "euclidean", index_type: str = "hnsw", **kwargs):
        """
        NearestNeighbors object using voyager for speed. Offers sklearn-style
        fit() / kneighbors() methods plus save() / load() for persistence.
        Utilizes the `voyager` library https://github.com/spotify/voyager .

        Args:
            space (str) : Distance metric for finding nearest neighbors.
                Matched case-insensitively; only "euclidean" is implemented.
                (default: "euclidean")
            index_type (str) : Index type within voyager to use. It is stored
                and written to the saved metadata.
                (supported: "hnsw")
            **kwargs: Accepted and ignored.

        Raises:
            NotImplementedError: If `space` is anything other than "euclidean".
        """
        space_key = space.lower()
        if space_key != "euclidean":
            raise NotImplementedError(f"Space {space} not implemented.")
        self.space = voyager.Space.Euclidean
        self.space_str = space_key
        self.index_type = index_type

    def fit(self, X: np.ndarray):
        """Create voyager index and train with data.

        Args:
            X (np.array): Array of N samples of shape (N, M)

        Returns:
            self: Fitted object
        """
        self.index_ = voyager.Index(self.space, num_dimensions=X.shape[1])
        self.index_.add_items(X)
        return self

    def kneighbors(self, X: np.ndarray, n_neighbors: int = 1):
        """Find the k nearest neighbors of each sample in X

        Args:
            X (np.array): Array of shape (N, M) of samples to search
                for neighbors of. M must be the same as the fit data.
            n_neighbors (int, optional): Number of neighbors to find.
                Defaults to 1.

        Returns:
            (distances, indices): Two np.array objects of shape (N, n_neighbors)
                containing the distances and indices of the closest neighbors.
                Indices are cast to int64.
        """
        # voyager hands back (ids, distances); the return order is swapped so
        # callers receive (distances, indices).
        # NOTE(review): distances are passed through exactly as voyager reports
        # them for its Euclidean space - confirm they match what callers expect.
        neighbor_ids, distances = self.index_.query(X, k=n_neighbors)
        return distances, neighbor_ids.astype(np.int64)

    def save(self, filepath: str):
        """Save Voyager index and metadata to disk for persistence across sessions.

        This saves two files:
        - {filepath}.index : Binary Voyager index
        - {filepath}.meta : JSON metadata with hyperparameters

        Missing parent directories of `filepath` are created first.

        Args:
            filepath (str): Base path for saving files (without extension)

        Raises:
            ValueError: If the model has not been fitted yet.

        Example:
            model.save("./indices/control_pop")
            # Creates: control_pop.index and control_pop.meta
        """
        if not hasattr(self, "index_"):
            raise ValueError("Model must be fitted before saving. Call fit() first.")

        index_path = f"{filepath}.index"
        meta_path = f"{filepath}.meta"

        # Both files share a directory; create it so a fresh location such as
        # "./indices/..." works without manual setup.
        parent_dir = os.path.dirname(index_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        self.index_.save(index_path)

        metadata = {
            "backend": "voyager",
            "index_type": self.index_type,
            "space": self.space_str,
            "n_samples": len(self.index_),
            "n_features": self.index_.num_dimensions,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "version": "0.2.0",
        }

        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

    @classmethod
    def load(cls, filepath: str):
        """Load Voyager index and metadata from disk.

        Args:
            filepath (str): Base path to load from (without extension)

        Returns:
            VoyagerNearestNeighbors: Loaded model ready for kneighbors() queries

        Raises:
            FileNotFoundError: If the .index or .meta file is missing.
            NotImplementedError: If the stored space is not supported.

        Example:
            model = VoyagerNearestNeighbors.load("./indices/control_pop")
            distances, indices = model.kneighbors(X_query, n_neighbors=5)
        """
        index_path = f"{filepath}.index"
        meta_path = f"{filepath}.meta"

        if not os.path.exists(index_path):
            raise FileNotFoundError(f"Index file not found: {index_path}")
        if not os.path.exists(meta_path):
            raise FileNotFoundError(f"Metadata file not found: {meta_path}")

        with open(meta_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        instance = cls(
            space=metadata["space"],
            index_type=metadata["index_type"],
        )

        # The constructor has already validated the space name and resolved it
        # to a voyager enum, so reuse that instead of deriving it a second time.
        instance.index_ = voyager.Index.load(
            index_path,
            space=instance.space,
            num_dimensions=metadata["n_features"],
        )

        return instance
def __init__(self, space: str = "euclidean", index_type: str = "hnsw", **kwargs):
    """
    Set up a nearest-neighbour searcher backed by the `voyager` library
    (https://github.com/spotify/voyager).

    Args:
        space (str) : Distance metric for finding nearest neighbors.
            Compared case-insensitively; only "euclidean" is implemented.
            (default: "euclidean")
        index_type (str) : Index type within voyager to use. The value is
            only stored on the instance here.
            (supported: "hnsw")
        **kwargs: Accepted and ignored.

    Raises:
        NotImplementedError: If `space` is anything other than "euclidean".
    """
    requested_space = space.lower()
    if requested_space != "euclidean":
        # Report the caller's original spelling, not the lowercased one.
        raise NotImplementedError(f"Space {space} not implemented.")

    self.space = voyager.Space.Euclidean
    self.space_str = requested_space
    self.index_type = index_type
NearestNeighbors object using voyager for speed. Identical API to sklearn but faster.
Utilizes the voyager library https://github.com/spotify/voyager .
Args: space (str) : Distance metric for finding nearest neighbors (default: "euclidean") index_type (str) : Index type within voyager to use (supported: "hnsw")
def fit(self, X: np.ndarray):
    """Build a fresh voyager index from the training samples.

    Args:
        X (np.array): Array of N samples of shape (N, M). M becomes the
            dimensionality of the index.

    Returns:
        self: Fitted object, with the index stored as `index_`.
    """
    n_features = X.shape[1]
    # The index is attached to the instance before it is populated.
    self.index_ = voyager.Index(self.space, num_dimensions=n_features)
    self.index_.add_items(X)
    return self
Create voyager index and train with data.
Args: X (np.array): Array of N samples of shape (N, M)
Returns: self: Fitted object
def kneighbors(self, X: np.ndarray, n_neighbors: int = 1):
    """Look up the nearest neighbors of every query sample in X.

    Args:
        X (np.array): Array of shape (N, M) of samples to search
            for neighbors of. M must be the same as the fit data.
        n_neighbors (int, optional): Number of neighbors to find.
            Defaults to 1.

    Returns:
        (distances, indices): Two np.array objects of shape (N, n_neighbors)
            containing the distances and indices of the closest neighbors.
            Indices are cast to int64.
    """
    # The index answers with (ids, distances); callers get the pair in the
    # opposite order.
    query_result = self.index_.query(X, k=n_neighbors)
    neighbor_ids, neighbor_distances = query_result
    as_int64 = neighbor_ids.astype(np.int64)
    return neighbor_distances, as_int64
Find the k nearest neighbors of each sample in X
Args: X (np.array): Array of shape (N,M) of samples to search for neighbors of. M must be the same as the fit data. n_neighbors (int, optional): Number of neighbors to find. Defaults to 1.
Returns: (distances, indices): Two np.array objects of shape (N,n_neighbors) containing the distances and indices of the closest neighbors.
def save(self, filepath: str):
    """Save Voyager index and metadata to disk for persistence across sessions.

    This saves two files:
    - {filepath}.index : Binary Voyager index
    - {filepath}.meta : JSON metadata with hyperparameters

    Missing parent directories of `filepath` are created first.

    Args:
        filepath (str): Base path for saving files (without extension)

    Raises:
        ValueError: If the model has not been fitted yet.

    Example:
        model.save("./indices/control_pop")
        # Creates: control_pop.index and control_pop.meta
    """
    if not hasattr(self, "index_"):
        raise ValueError("Model must be fitted before saving. Call fit() first.")

    index_path = f"{filepath}.index"
    meta_path = f"{filepath}.meta"

    # Both files share a directory; create it so a fresh location such as
    # "./indices/..." works without manual setup.
    parent_dir = os.path.dirname(index_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    self.index_.save(index_path)

    metadata = {
        "backend": "voyager",
        "index_type": self.index_type,
        "space": self.space_str,
        "n_samples": len(self.index_),
        "n_features": self.index_.num_dimensions,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "version": "0.2.0",
    }

    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)
Save Voyager index and metadata to disk for persistence across sessions.
This saves two files:
- {filepath}.index : Binary Voyager index
- {filepath}.meta : JSON metadata with hyperparameters
Args: filepath (str): Base path for saving files (without extension)
Example: model.save("./indices/control_pop") # Creates: control_pop.index and control_pop.meta
@classmethod
def load(cls, filepath: str):
    """Load Voyager index and metadata from disk.

    Args:
        filepath (str): Base path to load from (without extension)

    Returns:
        VoyagerNearestNeighbors: Loaded model ready for kneighbors() queries

    Raises:
        FileNotFoundError: If the .index or .meta file is missing.
        NotImplementedError: If the stored space is not supported.

    Example:
        model = VoyagerNearestNeighbors.load("./indices/control_pop")
        distances, indices = model.kneighbors(X_query, n_neighbors=5)
    """
    index_path = f"{filepath}.index"
    meta_path = f"{filepath}.meta"

    if not os.path.exists(index_path):
        raise FileNotFoundError(f"Index file not found: {index_path}")
    if not os.path.exists(meta_path):
        raise FileNotFoundError(f"Metadata file not found: {meta_path}")

    with open(meta_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)

    instance = cls(
        space=metadata["space"],
        index_type=metadata["index_type"],
    )

    # The constructor has already validated the space name and resolved it
    # to a voyager enum, so reuse that instead of deriving it a second time.
    instance.index_ = voyager.Index.load(
        index_path,
        space=instance.space,
        num_dimensions=metadata["n_features"],
    )

    return instance
Load Voyager index and metadata from disk.
Args: filepath (str): Base path to load from (without extension)
Returns: VoyagerNearestNeighbors: Loaded model ready for kneighbors() queries
Example: model = VoyagerNearestNeighbors.load("./indices/control_pop") distances, indices = model.kneighbors(X_query, n_neighbors=5)