Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,9 @@
*/
package io.github.jbellis.jvector.bench;

import io.github.jbellis.jvector.example.SiftSmall;
import io.github.jbellis.jvector.example.util.SiftLoader;
import io.github.jbellis.jvector.graph.*;
import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider;
import io.github.jbellis.jvector.util.Bits;
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import io.github.jbellis.jvector.vector.types.VectorFloat;
import org.openjdk.jmh.annotations.*;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/
package io.github.jbellis.jvector.bench;

import io.github.jbellis.jvector.example.SiftSmall;
import io.github.jbellis.jvector.example.benchmarks.datasets.SiftSmall;
import io.github.jbellis.jvector.example.util.SiftLoader;
import io.github.jbellis.jvector.graph.*;
import io.github.jbellis.jvector.graph.similarity.BuildScoreProvider;
Expand Down
6 changes: 3 additions & 3 deletions jvector-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@
<argument>-classpath</argument>
<classpath/>
<argument>-ea</argument>
<argument>io.github.jbellis.jvector.example.SiftSmall</argument>
<argument>io.github.jbellis.jvector.example.benchmarks.datasets.SiftSmall</argument>
</arguments>
</configuration>
</execution>
Expand Down Expand Up @@ -212,7 +212,7 @@
<classpath/>
<argument>--add-modules=jdk.incubator.vector</argument>
<argument>-ea</argument>
<argument>io.github.jbellis.jvector.example.SiftSmall</argument>
<argument>io.github.jbellis.jvector.example.benchmarks.datasets.SiftSmall</argument>
</arguments>
</configuration>
</execution>
Expand Down Expand Up @@ -306,7 +306,7 @@
<argument>--add-modules=jdk.incubator.vector</argument>
<argument>-ea</argument>
<argument>-Djvector.experimental.enable_native_vectorization=true</argument>
<argument>io.github.jbellis.jvector.example.SiftSmall</argument>
<argument>io.github.jbellis.jvector.example.benchmarks.datasets.SiftSmall</argument>
</arguments>
</configuration>
</execution>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
import io.github.jbellis.jvector.example.util.BenchmarkSummarizer;
import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats;
import io.github.jbellis.jvector.example.util.CheckpointManager;
import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.util.DataSetLoader;
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets;
import io.github.jbellis.jvector.example.yaml.MultiConfig;

import org.slf4j.Logger;
Expand Down Expand Up @@ -130,7 +130,9 @@ public static void main(String[] args) throws IOException {

logger.info("Loading dataset: {}", datasetName);
try {
DataSet ds = DataSetLoader.loadDataSet(datasetName);
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new IllegalStateException("Dataset " + datasetName + " not found")
);
logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.getBaseVectors().size());

String normalizedDatasetName = datasetName;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

import io.github.jbellis.jvector.example.util.CompressorParameters;
import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters;
import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.util.DataSetLoader;
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets;
import io.github.jbellis.jvector.example.yaml.DatasetCollection;
import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
Expand Down Expand Up @@ -90,7 +90,9 @@ private static void execute(Pattern pattern, List<Function<DataSet, CompressorPa
System.out.println("Executing the following datasets: " + datasetNames);

for (var datasetName : datasetNames) {
DataSet ds = DataSetLoader.loadDataSet(datasetName);
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Dataset " + datasetName + " not found")
);
Grid.runAll(ds, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid);
}
}
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

package io.github.jbellis.jvector.example;

import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.util.DataSetLoader;
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets;
import io.github.jbellis.jvector.example.yaml.DatasetCollection;
import io.github.jbellis.jvector.example.yaml.MultiConfig;

Expand Down Expand Up @@ -52,12 +52,15 @@ public static void main(String[] args) throws IOException {
if (!datasetNames.isEmpty()) {
System.out.println("Executing the following datasets: " + datasetNames);

for (var datasetName : datasetNames) {
DataSet ds = DataSetLoader.loadDataSet(datasetName);
String hdf5 = ".hdf5";
for (var rawname : datasetNames) {
// Strip a trailing ".hdf5" extension so the bare dataset name is used for loading and config lookup.
// The end index is exactly (length - suffix length); subtracting one more would also drop the
// last character of the dataset name itself.
String datasetName =
rawname.endsWith(hdf5) ? rawname.substring(0, rawname.length() - hdf5.length()) : rawname;
// pre-loading and early error phase
DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Could not load dataset:" + datasetName)
);

if (datasetName.endsWith(".hdf5")) {
datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length());
}
MultiConfig config = MultiConfig.getDefaultConfig(datasetName);
allConfigs.add(config);
}
Expand All @@ -76,7 +79,9 @@ public static void main(String[] args) throws IOException {
for (var config : allConfigs) {
String datasetName = config.dataset;

DataSet ds = DataSetLoader.loadDataSet(datasetName);
DataSet ds = DataSets.loadDataSet(datasetName).orElseThrow(
() -> new RuntimeException("Could not load dataset:" + datasetName)
);

Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction,
config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import io.github.jbellis.jvector.example.benchmarks.ThroughputBenchmark;
import io.github.jbellis.jvector.example.benchmarks.diagnostics.DiagnosticLevel;
import io.github.jbellis.jvector.example.util.CompressorParameters;
import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet;
import io.github.jbellis.jvector.example.util.FilteredForkJoinPool;
import io.github.jbellis.jvector.graph.ImmutableGraphIndex;
import io.github.jbellis.jvector.graph.GraphIndexBuilder;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@

package io.github.jbellis.jvector.example;

import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.util.DownloadHelper;
import io.github.jbellis.jvector.example.benchmarks.datasets.DataSetLoaderMFD;
import io.github.jbellis.jvector.example.yaml.MultiConfig;

import java.io.IOException;
Expand All @@ -28,14 +27,10 @@
public class HelloVectorWorld {
public static void main(String[] args) throws IOException {
System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());

String datasetName = "ada002-100k";

var mfd = DownloadHelper.maybeDownloadFvecs(datasetName);
DataSet ds = mfd.load();

var ds = new DataSetLoaderMFD().loadDataSet(datasetName)
.orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"));
MultiConfig config = MultiConfig.getConfig(datasetName);

Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction,
config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph,
config.construction.getFeatureSets(), config.construction.getCompressorParameters(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package io.github.jbellis.jvector.example.util;
package io.github.jbellis.jvector.example.benchmarks.datasets;

import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright DataStax, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.github.jbellis.jvector.example.benchmarks.datasets;

import java.util.Optional;

/**
* A DataSet Loader, which makes dataset sources modular and configurable without breaking existing callers.
*
* <p>Implementations resolve a dataset by name through {@link #loadDataSet(String)} and signal a missing
* dataset through the returned {@link Optional} rather than by throwing.
*/
public interface DataSetLoader {
/**
* Looks up the dataset with the given name and loads it if it is found.
*
* <p>Implementations of this method <em>MUST NOT</em> throw exceptions related to the presence or absence of a
* requested dataset. Instead, {@link Optional} should be used. Other errors should still be indicated with
* exceptions as usual, including any errors loading a dataset which has been found. Implementors should reliably
* return from this method, avoiding any {@link System#exit(int)} or similar calls.
*
* <p>Implementations are encouraged to include logging at debug level for diagnostics, such as when datasets are
* not found, and info level for when datasets are found and loaded. This can assist users troubleshooting
* diverse data sources.
*
* @param dataSetName the name of the dataset to look up
* @return an {@link Optional} holding the {@link DataSet} if it was found, otherwise an empty {@link Optional}
*/
Optional<DataSet> loadDataSet(String dataSetName);
}
Loading