fastmatch.knn_voyager

  1import numpy as np
  2import voyager
  3import json
  4import os
  5from datetime import datetime, timezone
  6
  7
  8class VoyagerNearestNeighbors:
  9    def __init__(self, space: str = "euclidean", index_type: str = "hnsw", **kwargs):
 10        """
 11        NearestNeighbors object using voyager for speed. Identical API to sklearn but faster.
 12        Utilizes the `voyager` library https://github.com/spotify/voyager .
 13
 14        Args:
 15            space (str) :  Distance metric for finding nearest neighbors
 16                (default: "euclidean")
 17            index_type (str) : Index type within voyager to use
 18                (supported: "hnsw")
 19        """
 20        if space.lower() == "euclidean":
 21            self.space = voyager.Space.Euclidean
 22        else:
 23            raise NotImplementedError(f"Space {space} not implemented.")
 24        self.space_str = space.lower()
 25        self.index_type = index_type
 26
 27    def fit(self, X: np.ndarray):
 28        """Create voyager index and train with data.
 29
 30        Args:
 31            X (np.array): Array of N samples of shape (NxM)
 32
 33        Returns:
 34            self: Fitted object
 35        """
 36        self.index_ = voyager.Index(self.space, num_dimensions=X.shape[1])
 37        self.index_.add_items(X)
 38        return self
 39
 40    def kneighbors(self, X: np.ndarray, n_neighbors: int = 1):
 41        """Find the k nearest neighbors of each sample in X
 42
 43        Args:
 44            X (np.array):  Array of shape (N,M) of samples to search
 45                for neighbors of. M must be the same as the fit data.
 46            n_neighbors (int, optional): Number of neighbors to find.
 47                Defaults to 1.
 48
 49        Returns:
 50            (distances, indices): Two np.array objects of shape (N,n_neighbors)
 51                containing the distances and indices of the closest neighbors.
 52        """
 53        indices, distances = self.index_.query(X, k=n_neighbors)
 54        return distances, indices.astype(np.int64)
 55
 56    def save(self, filepath: str):
 57        """Save Voyager index and metadata to disk for persistence across sessions.
 58
 59        This saves two files:
 60        - {filepath}.index : Binary Voyager index
 61        - {filepath}.meta : JSON metadata with hyperparameters
 62
 63        Args:
 64            filepath (str): Base path for saving files (without extension)
 65
 66        Example:
 67            model.save("./indices/control_pop")
 68            # Creates: control_pop.index and control_pop.meta
 69        """
 70        if not hasattr(self, "index_"):
 71            raise ValueError("Model must be fitted before saving. Call fit() first.")
 72
 73        index_path = f"{filepath}.index"
 74        meta_path = f"{filepath}.meta"
 75
 76        self.index_.save(index_path)
 77
 78        metadata = {
 79            "backend": "voyager",
 80            "index_type": self.index_type,
 81            "space": self.space_str,
 82            "n_samples": len(self.index_),
 83            "n_features": self.index_.num_dimensions,
 84            "created_at": datetime.now(timezone.utc).isoformat(),
 85            "version": "0.2.0",
 86        }
 87
 88        with open(meta_path, "w") as f:
 89            json.dump(metadata, f, indent=2)
 90
 91    @classmethod
 92    def load(cls, filepath: str):
 93        """Load Voyager index and metadata from disk.
 94
 95        Args:
 96            filepath (str): Base path to load from (without extension)
 97
 98        Returns:
 99            VoyagerNearestNeighbors: Loaded model ready for kneighbors() queries
100
101        Example:
102            model = VoyagerNearestNeighbors.load("./indices/control_pop")
103            distances, indices = model.kneighbors(X_query, k=5)
104        """
105        index_path = f"{filepath}.index"
106        meta_path = f"{filepath}.meta"
107
108        if not os.path.exists(index_path):
109            raise FileNotFoundError(f"Index file not found: {index_path}")
110        if not os.path.exists(meta_path):
111            raise FileNotFoundError(f"Metadata file not found: {meta_path}")
112
113        with open(meta_path, "r") as f:
114            metadata = json.load(f)
115
116        instance = cls(
117            space=metadata["space"],
118            index_type=metadata["index_type"],
119        )
120
121        space_enum = voyager.Space.Euclidean if metadata["space"] == "euclidean" else None
122        instance.index_ = voyager.Index.load(
123            index_path,
124            space=space_enum,
125            num_dimensions=metadata["n_features"]
126        )
127
128        return instance
class VoyagerNearestNeighbors:
  9class VoyagerNearestNeighbors:
 10    def __init__(self, space: str = "euclidean", index_type: str = "hnsw", **kwargs):
 11        """
 12        NearestNeighbors object using voyager for speed. Identical API to sklearn but faster.
 13        Utilizes the `voyager` library https://github.com/spotify/voyager .
 14
 15        Args:
 16            space (str) :  Distance metric for finding nearest neighbors
 17                (default: "euclidean")
 18            index_type (str) : Index type within voyager to use
 19                (supported: "hnsw")
 20        """
 21        if space.lower() == "euclidean":
 22            self.space = voyager.Space.Euclidean
 23        else:
 24            raise NotImplementedError(f"Space {space} not implemented.")
 25        self.space_str = space.lower()
 26        self.index_type = index_type
 27
 28    def fit(self, X: np.ndarray):
 29        """Create voyager index and train with data.
 30
 31        Args:
 32            X (np.array): Array of N samples of shape (NxM)
 33
 34        Returns:
 35            self: Fitted object
 36        """
 37        self.index_ = voyager.Index(self.space, num_dimensions=X.shape[1])
 38        self.index_.add_items(X)
 39        return self
 40
 41    def kneighbors(self, X: np.ndarray, n_neighbors: int = 1):
 42        """Find the k nearest neighbors of each sample in X
 43
 44        Args:
 45            X (np.array):  Array of shape (N,M) of samples to search
 46                for neighbors of. M must be the same as the fit data.
 47            n_neighbors (int, optional): Number of neighbors to find.
 48                Defaults to 1.
 49
 50        Returns:
 51            (distances, indices): Two np.array objects of shape (N,n_neighbors)
 52                containing the distances and indices of the closest neighbors.
 53        """
 54        indices, distances = self.index_.query(X, k=n_neighbors)
 55        return distances, indices.astype(np.int64)
 56
 57    def save(self, filepath: str):
 58        """Save Voyager index and metadata to disk for persistence across sessions.
 59
 60        This saves two files:
 61        - {filepath}.index : Binary Voyager index
 62        - {filepath}.meta : JSON metadata with hyperparameters
 63
 64        Args:
 65            filepath (str): Base path for saving files (without extension)
 66
 67        Example:
 68            model.save("./indices/control_pop")
 69            # Creates: control_pop.index and control_pop.meta
 70        """
 71        if not hasattr(self, "index_"):
 72            raise ValueError("Model must be fitted before saving. Call fit() first.")
 73
 74        index_path = f"{filepath}.index"
 75        meta_path = f"{filepath}.meta"
 76
 77        self.index_.save(index_path)
 78
 79        metadata = {
 80            "backend": "voyager",
 81            "index_type": self.index_type,
 82            "space": self.space_str,
 83            "n_samples": len(self.index_),
 84            "n_features": self.index_.num_dimensions,
 85            "created_at": datetime.now(timezone.utc).isoformat(),
 86            "version": "0.2.0",
 87        }
 88
 89        with open(meta_path, "w") as f:
 90            json.dump(metadata, f, indent=2)
 91
 92    @classmethod
 93    def load(cls, filepath: str):
 94        """Load Voyager index and metadata from disk.
 95
 96        Args:
 97            filepath (str): Base path to load from (without extension)
 98
 99        Returns:
100            VoyagerNearestNeighbors: Loaded model ready for kneighbors() queries
101
102        Example:
103            model = VoyagerNearestNeighbors.load("./indices/control_pop")
104            distances, indices = model.kneighbors(X_query, k=5)
105        """
106        index_path = f"{filepath}.index"
107        meta_path = f"{filepath}.meta"
108
109        if not os.path.exists(index_path):
110            raise FileNotFoundError(f"Index file not found: {index_path}")
111        if not os.path.exists(meta_path):
112            raise FileNotFoundError(f"Metadata file not found: {meta_path}")
113
114        with open(meta_path, "r") as f:
115            metadata = json.load(f)
116
117        instance = cls(
118            space=metadata["space"],
119            index_type=metadata["index_type"],
120        )
121
122        space_enum = voyager.Space.Euclidean if metadata["space"] == "euclidean" else None
123        instance.index_ = voyager.Index.load(
124            index_path,
125            space=space_enum,
126            num_dimensions=metadata["n_features"]
127        )
128
129        return instance
VoyagerNearestNeighbors(space: str = 'euclidean', index_type: str = 'hnsw', **kwargs)
10    def __init__(self, space: str = "euclidean", index_type: str = "hnsw", **kwargs):
11        """
12        NearestNeighbors object using voyager for speed. Identical API to sklearn but faster.
13        Utilizes the `voyager` library https://github.com/spotify/voyager .
14
15        Args:
16            space (str) :  Distance metric for finding nearest neighbors
17                (default: "euclidean")
18            index_type (str) : Index type within voyager to use
19                (supported: "hnsw")
20        """
21        if space.lower() == "euclidean":
22            self.space = voyager.Space.Euclidean
23        else:
24            raise NotImplementedError(f"Space {space} not implemented.")
25        self.space_str = space.lower()
26        self.index_type = index_type

NearestNeighbors object using voyager for speed. Identical API to sklearn but faster. Utilizes the voyager library https://github.com/spotify/voyager .

Args: space (str) : Distance metric for finding nearest neighbors (default: "euclidean") index_type (str) : Index type within voyager to use (supported: "hnsw")

space_str
index_type
def fit(self, X: numpy.ndarray):
28    def fit(self, X: np.ndarray):
29        """Create voyager index and train with data.
30
31        Args:
32            X (np.array): Array of N samples of shape (NxM)
33
34        Returns:
35            self: Fitted object
36        """
37        self.index_ = voyager.Index(self.space, num_dimensions=X.shape[1])
38        self.index_.add_items(X)
39        return self

Create voyager index and train with data.

Args: X (np.array): Array of N samples of shape (NxM)

Returns: self: Fitted object

def kneighbors(self, X: numpy.ndarray, n_neighbors: int = 1):
41    def kneighbors(self, X: np.ndarray, n_neighbors: int = 1):
42        """Find the k nearest neighbors of each sample in X
43
44        Args:
45            X (np.array):  Array of shape (N,M) of samples to search
46                for neighbors of. M must be the same as the fit data.
47            n_neighbors (int, optional): Number of neighbors to find.
48                Defaults to 1.
49
50        Returns:
51            (distances, indices): Two np.array objects of shape (N,n_neighbors)
52                containing the distances and indices of the closest neighbors.
53        """
54        indices, distances = self.index_.query(X, k=n_neighbors)
55        return distances, indices.astype(np.int64)

Find the k nearest neighbors of each sample in X

Args: X (np.array): Array of shape (N,M) of samples to search for neighbors of. M must be the same as the fit data. n_neighbors (int, optional): Number of neighbors to find. Defaults to 1.

Returns: (distances, indices): Two np.array objects of shape (N,n_neighbors) containing the distances and indices of the closest neighbors.

def save(self, filepath: str):
57    def save(self, filepath: str):
58        """Save Voyager index and metadata to disk for persistence across sessions.
59
60        This saves two files:
61        - {filepath}.index : Binary Voyager index
62        - {filepath}.meta : JSON metadata with hyperparameters
63
64        Args:
65            filepath (str): Base path for saving files (without extension)
66
67        Example:
68            model.save("./indices/control_pop")
69            # Creates: control_pop.index and control_pop.meta
70        """
71        if not hasattr(self, "index_"):
72            raise ValueError("Model must be fitted before saving. Call fit() first.")
73
74        index_path = f"{filepath}.index"
75        meta_path = f"{filepath}.meta"
76
77        self.index_.save(index_path)
78
79        metadata = {
80            "backend": "voyager",
81            "index_type": self.index_type,
82            "space": self.space_str,
83            "n_samples": len(self.index_),
84            "n_features": self.index_.num_dimensions,
85            "created_at": datetime.now(timezone.utc).isoformat(),
86            "version": "0.2.0",
87        }
88
89        with open(meta_path, "w") as f:
90            json.dump(metadata, f, indent=2)

Save Voyager index and metadata to disk for persistence across sessions.

This saves two files:

  • {filepath}.index : Binary Voyager index
  • {filepath}.meta : JSON metadata with hyperparameters

Args: filepath (str): Base path for saving files (without extension)

Example: model.save("./indices/control_pop") # Creates: control_pop.index and control_pop.meta

@classmethod
def load(cls, filepath: str):
 92    @classmethod
 93    def load(cls, filepath: str):
 94        """Load Voyager index and metadata from disk.
 95
 96        Args:
 97            filepath (str): Base path to load from (without extension)
 98
 99        Returns:
100            VoyagerNearestNeighbors: Loaded model ready for kneighbors() queries
101
102        Example:
103            model = VoyagerNearestNeighbors.load("./indices/control_pop")
104            distances, indices = model.kneighbors(X_query, k=5)
105        """
106        index_path = f"{filepath}.index"
107        meta_path = f"{filepath}.meta"
108
109        if not os.path.exists(index_path):
110            raise FileNotFoundError(f"Index file not found: {index_path}")
111        if not os.path.exists(meta_path):
112            raise FileNotFoundError(f"Metadata file not found: {meta_path}")
113
114        with open(meta_path, "r") as f:
115            metadata = json.load(f)
116
117        instance = cls(
118            space=metadata["space"],
119            index_type=metadata["index_type"],
120        )
121
122        space_enum = voyager.Space.Euclidean if metadata["space"] == "euclidean" else None
123        instance.index_ = voyager.Index.load(
124            index_path,
125            space=space_enum,
126            num_dimensions=metadata["n_features"]
127        )
128
129        return instance

Load Voyager index and metadata from disk.

Args: filepath (str): Base path to load from (without extension)

Returns: VoyagerNearestNeighbors: Loaded model ready for kneighbors() queries

Example: model = VoyagerNearestNeighbors.load("./indices/control_pop") distances, indices = model.kneighbors(X_query, n_neighbors=5)