Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ jobs:
fail-fast: false
matrix:
WEAVIATE_VERSION:
["1.32.24", "1.33.11", "1.34.7", "1.35.2", "1.36.9", "1.37.0-rc.0"]
["1.32.24", "1.33.11", "1.34.7", "1.35.2", "1.36.9", "1.37.1"]
steps:
- uses: actions/checkout@v4

Expand Down
2 changes: 1 addition & 1 deletion src/it/java/io/weaviate/containers/Weaviate.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public enum Version {
V134(1, 34, 7),
V135(1, 35, 2),
V136(1, 36, 9),
V137(1, 37, "0-rc.0");
V137(1, 37, 1);

public final SemanticVersion semver;

Expand Down
41 changes: 41 additions & 0 deletions src/it/java/io/weaviate/integration/TokenizeITest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package io.weaviate.integration;

import org.assertj.core.api.Assertions;
import org.junit.BeforeClass;
import org.junit.Test;

import io.weaviate.ConcurrentTest;
import io.weaviate.client6.v1.api.WeaviateClient;
import io.weaviate.client6.v1.api.collections.Property;
import io.weaviate.client6.v1.api.collections.Tokenization;
import io.weaviate.containers.Container;
import io.weaviate.containers.Weaviate;

/**
 * Integration test for {@code client.tokenize}: tokenizing a text with an
 * explicitly supplied tokenization must give the same result as tokenizing
 * it against an existing property configured with that tokenization.
 */
public class TokenizeITest extends ConcurrentTest {
  private static final WeaviateClient client = Container.WEAVIATE.getClient();

  /**
   * Class-level gate on {@code Weaviate.Version.V137}.
   * NOTE(review): presumably skips the suite when the server under test
   * is older than that version -- confirm against {@code orSkip()}.
   */
  @BeforeClass
  public static void __() {
    Weaviate.Version.V137.orSkip();
  }

  @Test
  public void testTokenize() throws Exception {
    // Arrange: a collection whose "sentence" property uses trigram tokenization.
    var text = "hello world";
    var collectionName = ns("Words");
    client.collections.create(
        collectionName,
        collection -> collection.properties(
            Property.text(
                "sentence",
                property -> property.tokenization(Tokenization.TRIGRAM))));

    // Act: tokenize once with ad-hoc settings, once via the property's settings.
    var fromExplicitSettings = client.tokenize.text(
        text,
        options -> options.tokenization(Tokenization.TRIGRAM));
    var fromPropertySettings = client.tokenize.text(text, collectionName, "sentence");

    // Assert: both paths must agree.
    Assertions.assertThat(fromPropertySettings).isEqualTo(fromExplicitSettings);
  }
}
8 changes: 8 additions & 0 deletions src/main/java/io/weaviate/client6/v1/api/WeaviateClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import io.weaviate.client6.v1.api.rbac.groups.WeaviateGroupsClient;
import io.weaviate.client6.v1.api.rbac.roles.WeaviateRolesClient;
import io.weaviate.client6.v1.api.rbac.users.WeaviateUsersClient;
import io.weaviate.client6.v1.api.tokenize.WeaviateTokenizeClient;
import io.weaviate.client6.v1.internal.ObjectBuilder;
import io.weaviate.client6.v1.internal.Timeout;
import io.weaviate.client6.v1.internal.TokenProvider;
Expand Down Expand Up @@ -62,6 +63,12 @@ public class WeaviateClient implements AutoCloseable {
*/
public final WeaviateClusterClient cluster;

/**
* Client for {@code /tokenize} and
* {@code /schema/{collection}/property/{property}/tokenize} endpoints.
*/
public final WeaviateTokenizeClient tokenize;

public WeaviateClient(Config config) {
RestTransportOptions restOpt = config.restTransportOptions();
GrpcChannelOptions grpcOpt;
Expand Down Expand Up @@ -117,6 +124,7 @@ public WeaviateClient(Config config) {
this.grpcTransport = new DefaultGrpcTransport(grpcOpt);
this.alias = new WeaviateAliasClient(restTransport);
this.backup = new WeaviateBackupClient(restTransport);
this.tokenize = new WeaviateTokenizeClient(restTransport);
this.collections = new WeaviateCollectionsClient(restTransport, grpcTransport);
this.roles = new WeaviateRolesClient(restTransport);
this.groups = new WeaviateGroupsClient(restTransport);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import io.weaviate.client6.v1.api.rbac.groups.WeaviateGroupsClientAsync;
import io.weaviate.client6.v1.api.rbac.roles.WeaviateRolesClientAsync;
import io.weaviate.client6.v1.api.rbac.users.WeaviateUsersClientAsync;
import io.weaviate.client6.v1.api.tokenize.WeaviateTokenizeClientAsync;
import io.weaviate.client6.v1.internal.ObjectBuilder;
import io.weaviate.client6.v1.internal.Timeout;
import io.weaviate.client6.v1.internal.TokenProvider;
Expand Down Expand Up @@ -61,6 +62,12 @@ public class WeaviateClientAsync implements AutoCloseable {
*/
public final WeaviateClusterClientAsync cluster;

/**
* Client for {@code /tokenize} and
* {@code /schema/{collection}/property/{property}/tokenize} endpoints.
*/
public final WeaviateTokenizeClientAsync tokenize;

/**
* This constructor is blocking if {@link Authentication} configured,
* as the client will need to do the initial token exchange.
Expand Down Expand Up @@ -121,6 +128,7 @@ public WeaviateClientAsync(Config config) {
this.grpcTransport = new DefaultGrpcTransport(grpcOpt);
this.alias = new WeaviateAliasClientAsync(restTransport);
this.backup = new WeaviateBackupClientAsync(restTransport);
this.tokenize = new WeaviateTokenizeClientAsync(restTransport);
this.roles = new WeaviateRolesClientAsync(restTransport);
this.groups = new WeaviateGroupsClientAsync(restTransport);
this.users = new WeaviateUsersClientAsync(restTransport);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package io.weaviate.client6.v1.api.collections;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

import com.google.gson.annotations.SerializedName;
Expand All @@ -15,6 +17,7 @@ public record InvertedIndex(
@SerializedName("bm25") Bm25 bm25,
/** Common words which should be ignored in queries. */
@SerializedName("stopwords") Stopwords stopwords,
@SerializedName("stopwordPresets") Map<String, List<String>> stopwordPresets,
/**
* If true, indexes object creation and update timestamps,
* enabling filtering by creationTimeUnix and lastUpdateTimeUnix.
Expand Down Expand Up @@ -135,6 +138,7 @@ public InvertedIndex(Builder builder) {
builder.cleanupIntervalSeconds,
builder.bm25,
builder.stopwords,
builder.stopwordPresets,
builder.indexTimestamps,
builder.indexNulls,
builder.indexPropertyLength,
Expand All @@ -145,6 +149,7 @@ public static class Builder implements ObjectBuilder<InvertedIndex> {
private Integer cleanupIntervalSeconds;
private Bm25 bm25;
private Stopwords stopwords;
private Map<String, List<String>> stopwordPresets = new HashMap<>();
private Boolean indexTimestamps;
private Boolean indexNulls;
private Boolean indexPropertyLength;
Expand All @@ -168,6 +173,12 @@ public Builder stopwords(Function<Stopwords.Builder, ObjectBuilder<Stopwords>> f
return this;
}

/**
 * Supply custom stopword presets.
 *
 * <p>
 * The given map replaces any value previously held by this builder.
 * It is stored by reference, not copied.
 * NOTE(review): presumably each key is a preset name and each value is
 * the list of stopwords in that preset -- confirm against the server API.
 *
 * @param stopwordPresets stopword presets to send with the inverted index
 *                        configuration
 * @return this builder
 */
public Builder stopwordPresets(Map<String, List<String>> stopwordPresets) {
this.stopwordPresets = stopwordPresets;
return this;
}

/**
* Enable / disable creating an index for creation / update timestamps.
*
Expand Down
17 changes: 17 additions & 0 deletions src/main/java/io/weaviate/client6/v1/api/collections/Property.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public record Property(
@SerializedName("indexRangeFilters") Boolean indexRangeFilters,
@SerializedName("indexSearchable") Boolean indexSearchable,
@SerializedName("tokenization") Tokenization tokenization,
@SerializedName("textAnalyzer") TextAnalyzer textAnalyzer,
@SerializedName("skipVectorization") Boolean skipVectorization,
@SerializedName("vectorizePropertyName") Boolean vectorizePropertyName,
@SerializedName("nestedProperties") List<Property> nestedProperties) {
Expand Down Expand Up @@ -407,6 +408,7 @@ public Property(Builder builder) {
builder.indexRangeFilters,
builder.indexSearchable,
builder.tokenization,
builder.textAnalyzer,
builder.skipVectorization,
builder.vectorizePropertyName,
builder.nestedProperties.isEmpty() ? null : builder.nestedProperties);
Expand Down Expand Up @@ -435,6 +437,7 @@ public static class Builder implements ObjectBuilder<Property> {
private Boolean indexRangeFilters;
private Boolean indexSearchable;
private Tokenization tokenization;
private TextAnalyzer textAnalyzer;
private Boolean skipVectorization;
private Boolean vectorizePropertyName;
private List<Property> nestedProperties = new ArrayList<>();
Expand Down Expand Up @@ -555,6 +558,20 @@ public Builder tokenization(Tokenization tokenization) {
return this;
}

/**
 * Configures per-property text analysis for {@code text} and {@code text[]}
 * properties that use an inverted index (searchable or filterable).
 *
 * <p>
 * Supports ASCII folding (accent/diacritic handling) and selecting
 * a stopword preset that overrides the collection-level
 * {@code invertedIndexConfig.stopwords} setting for this property only.
 *
 * @param textAnalyzer analyzer settings for this property; replaces any
 *                     value previously set on this builder
 * @return this builder
 */
public Builder textAnalyzer(TextAnalyzer textAnalyzer) {
this.textAnalyzer = textAnalyzer;
return this;
}

public Builder skipVectorization(boolean skipVectorization) {
this.skipVectorization = skipVectorization;
return this;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package io.weaviate.client6.v1.api.collections;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.Function;

import com.google.gson.annotations.SerializedName;

import io.weaviate.client6.v1.internal.ObjectBuilder;

/**
 * Per-property text analysis settings: ASCII folding and the stopword
 * preset to apply to a property.
 *
 * <p>
 * NOTE(review): the serialized keys here are snake_case, while sibling models
 * in this package use camelCase keys (e.g. {@code stopwordPresets},
 * {@code indexSearchable}) -- confirm the server expects these exact names.
 */
public record TextAnalyzer(
    /** Whether ASCII folding is enabled. */
    @SerializedName("ascii_fold") Boolean foldAscii,
    /**
     * Values sent as {@code ascii_fold_ignore}.
     * NOTE(review): presumably characters exempt from folding -- confirm.
     */
    @SerializedName("ascii_fold_ignore") List<String> keepAscii,
    /** Name of the stopword preset to use for the property. */
    @SerializedName("stopword_preset") String stopwordPreset) {

  /**
   * Create a text analyzer with the builder's default settings.
   *
   * @return a non-null analyzer equivalent to {@code new Builder().build()}
   */
  public static TextAnalyzer of() {
    // Previously returned null, which silently dropped the analyzer
    // (or caused an NPE) at the call site.
    return new Builder().build();
  }

  /**
   * Create a text analyzer with custom settings.
   *
   * @param fn lambda that customizes the supplied builder
   * @return the analyzer produced by the builder returned from {@code fn}
   */
  public static TextAnalyzer of(Function<Builder, ObjectBuilder<TextAnalyzer>> fn) {
    return fn.apply(new Builder()).build();
  }

  /**
   * Copy the builder's current state into a new record.
   *
   * @param builder source of the field values
   */
  public TextAnalyzer(Builder builder) {
    this(
        builder.foldAscii,
        builder.keepAscii,
        builder.stopwordPreset);
  }

  /** Builder for {@link TextAnalyzer}. ASCII folding is enabled by default. */
  public static class Builder implements ObjectBuilder<TextAnalyzer> {
    // Private, matching the other builders in this package.
    private Boolean foldAscii = true;
    private List<String> keepAscii = new ArrayList<>();
    private String stopwordPreset;

    /**
     * Enable / disable ASCII folding.
     *
     * @param enable {@code true} to fold, {@code false} to leave text as is
     * @return this builder
     */
    public Builder foldAscii(boolean enable) {
      this.foldAscii = enable;
      return this;
    }

    /**
     * Set the {@code ascii_fold_ignore} values, replacing any previous ones.
     *
     * @param keepAscii values to send
     * @return this builder
     */
    public Builder keepAscii(String... keepAscii) {
      return keepAscii(Arrays.asList(keepAscii));
    }

    /**
     * Set the {@code ascii_fold_ignore} values, replacing any previous ones.
     *
     * @param keepAscii values to send; {@code null} clears the field
     * @return this builder
     */
    public Builder keepAscii(List<String> keepAscii) {
      // Copy so that later changes to the caller's list (or varargs array)
      // do not leak into the built record.
      this.keepAscii = keepAscii == null ? null : new ArrayList<>(keepAscii);
      return this;
    }

    /**
     * Select the stopword preset for the property.
     *
     * @param stopwordPreset preset name
     * @return this builder
     */
    public Builder stopwordPreset(String stopwordPreset) {
      this.stopwordPreset = stopwordPreset;
      return this;
    }

    @Override
    public TextAnalyzer build() {
      return new TextAnalyzer(this);
    }
  }
}
Loading
Loading