Commit bcc6929

merge

2 parents e59cab5 + 13095c9

8 files changed

Lines changed: 1140 additions & 1043 deletions

.github/workflows/ci.yaml

Lines changed: 1 addition & 22 deletions
@@ -9,17 +9,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: ["ubuntu-latest", "windows-latest"]
+        os: ["ubuntu-latest"]
         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-        exclude:
-          - os: windows-latest
-            python-version: "3.9"
-          - os: windows-latest
-            python-version: "3.11"
-          - os: windows-latest
-            python-version: "3.12"
-          - os: windows-latest
-            python-version: "3.13"
       fail-fast: false

     steps:
@@ -31,19 +22,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
         allow-prereleases: true

-      # Step for Windows: Create and activate a virtual environment
-      - name: Create and activate a virtual environment (Windows)
-        if: ${{ runner.os == 'Windows' }}
-        run: |
-          irm https://astral.sh/uv/install.ps1 | iex
-          $env:Path = "C:\Users\runneradmin\.local\bin;$env:Path"
-          uv venv .venv
-          "VIRTUAL_ENV=.venv" | Out-File -FilePath $env:GITHUB_ENV -Append
-          "$PWD/.venv/Scripts" | Out-File -FilePath $env:GITHUB_PATH -Append
-
-      # Step for Unix: Create and activate a virtual environment
       - name: Create and activate a virtual environment (Unix)
-        if: ${{ runner.os != 'Windows' }}
        run: |
           curl -LsSf https://astral.sh/uv/install.sh | sh
           uv venv .venv

model2vec/hf_utils.py

Lines changed: 57 additions & 30 deletions
@@ -9,6 +9,7 @@
 import numpy as np
 import safetensors
 from huggingface_hub import ModelCard, ModelCardData
+from huggingface_hub.constants import HF_HUB_CACHE
 from safetensors.numpy import save_file
 from tokenizers import Tokenizer

@@ -107,9 +108,10 @@ def _create_model_card(

 def load_pretrained(
     folder_or_repo_path: str | Path,
-    subfolder: str | None = None,
-    token: str | None = None,
-    from_sentence_transformers: bool = False,
+    subfolder: str | None,
+    token: str | None,
+    from_sentence_transformers: bool,
+    force_download: bool,
 ) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any], np.ndarray | None, np.ndarray | None]:
     """
     Loads a pretrained model from a folder.
@@ -120,8 +122,10 @@ def load_pretrained(
     :param subfolder: The subfolder to load from.
     :param token: The huggingface token to use.
     :param from_sentence_transformers: Whether to load the model from a sentence transformers model.
+    :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+        already present in the cache.
     :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
-    :return: The embeddings, tokenizer, config, and metadata.
+    :return: The embeddings, tokenizer, config, metadata, weights and mapping.

     """
     if from_sentence_transformers:
@@ -133,7 +137,13 @@ def load_pretrained(
         tokenizer_file = "tokenizer.json"
         config_name = "config.json"

-    folder_or_repo_path = Path(folder_or_repo_path)
+    cached_folder = _get_latest_model_path(str(folder_or_repo_path))
+    if cached_folder and not force_download:
+        logger.info(f"Found cached model at {cached_folder}, loading from cache.")
+        folder_or_repo_path = cached_folder
+    else:
+        logger.info(f"No cached model found for {folder_or_repo_path}, loading from local or hub.")
+        folder_or_repo_path = Path(folder_or_repo_path)

     local_folder = folder_or_repo_path / subfolder if subfolder else folder_or_repo_path

@@ -150,9 +160,7 @@ def load_pretrained(
         if not tokenizer_path.exists():
             raise FileNotFoundError(f"Tokenizer file does not exist in {local_folder}")

-        # README is optional, so this is a bit finicky.
         readme_path = local_folder / "README.md"
-        metadata = _get_metadata_from_readme(readme_path)

     else:
         logger.info("Folder does not exist locally, attempting to use huggingface hub.")
@@ -161,18 +169,11 @@ def load_pretrained(
                 folder_or_repo_path.as_posix(), model_file, token=token, subfolder=subfolder
             )
         )
-
-        try:
-            readme_path = Path(
-                huggingface_hub.hf_hub_download(
-                    folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
-                )
+        readme_path = Path(
+            huggingface_hub.hf_hub_download(
+                folder_or_repo_path.as_posix(), "README.md", token=token, subfolder=subfolder
             )
-            metadata = _get_metadata_from_readme(Path(readme_path))
-        except Exception as e:
-            # NOTE: we don't want to raise an error here, since the README is optional.
-            logger.info(f"No README found in the model folder: {e} No model card loaded.")
-            metadata = {}
+        )

         config_path = Path(
             huggingface_hub.hf_hub_download(
@@ -186,21 +187,22 @@ def load_pretrained(
         )

     opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    if from_sentence_transformers:
-        embeddings = opened_tensor_file.get_tensor("embedding.weight")
+    embedding_name = "embedding.weight" if from_sentence_transformers else "embeddings"
+    embeddings = opened_tensor_file.get_tensor(embedding_name)
+    try:
+        weights = opened_tensor_file.get_tensor("weights")
+    except Exception:
+        # Bare except because safetensors does not export its own errors.
         weights = None
+    try:
+        mapping = opened_tensor_file.get_tensor("mapping")
+    except Exception:
         mapping = None
+
+    if readme_path.exists():
+        metadata = _get_metadata_from_readme(readme_path)
     else:
-        embeddings = opened_tensor_file.get_tensor("embeddings")
-        try:
-            weights = opened_tensor_file.get_tensor("weights")
-        except Exception:
-            # Bare except because safetensors does not export its own errors.
-            weights = None
-        try:
-            mapping = opened_tensor_file.get_tensor("mapping")
-        except Exception:
-            mapping = None
+        metadata = {}

     tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
     config = json.load(open(config_path))
@@ -240,3 +242,28 @@ def push_folder_to_hub(
     huggingface_hub.upload_folder(repo_id=repo_id, folder_path=folder_path, token=token, path_in_repo=subfolder)

     logger.info(f"Pushed model to {repo_id}")
+
+
+def _get_latest_model_path(model_id: str) -> Path | None:
+    """
+    Gets the latest model path for a given identifier from the hugging face hub cache.
+
+    Returns None if there is no cached model. In this case, the model will be downloaded.
+    """
+    # Make path object
+    cache_dir = Path(HF_HUB_CACHE)
+    # This is specific to how HF stores the files.
+    normalized = model_id.replace("/", "--")
+    repo_dir = cache_dir / f"models--{normalized}" / "snapshots"
+
+    if not repo_dir.exists():
+        return None
+
+    # Find all directories.
+    snapshots = [p for p in repo_dir.iterdir() if p.is_dir()]
+    if not snapshots:
+        return None
+
+    # Get the latest directory by modification time.
+    latest_snapshot = max(snapshots, key=lambda p: p.stat().st_mtime)
+    return latest_snapshot
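
With the keyword defaults removed, load_pretrained now requires callers to spell out every argument, which makes the new cache behavior explicit at each call site. A minimal sketch of a direct call under the new signature; the repo id is a hypothetical placeholder:

from model2vec.hf_utils import load_pretrained

# Hypothetical repo id; any Model2Vec repo on the hub would work here.
embeddings, tokenizer, config, metadata, weights, mapping = load_pretrained(
    "some-org/some-static-model",
    subfolder=None,
    token=None,
    from_sentence_transformers=False,
    force_download=False,  # reuse the newest cached snapshot if one exists
)
print(embeddings.shape, config)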

model2vec/model.py

Lines changed: 18 additions & 35 deletions
@@ -12,8 +12,8 @@
 from tokenizers import Encoding, Tokenizer
 from tqdm import tqdm

-from model2vec.quantization import DType
-from model2vec.utils import ProgressParallel, load_local_model
+from model2vec.quantization import DType, quantize_and_reduce_dim
+from model2vec.utils import ProgressParallel

 PathLike = Union[Path, str]

@@ -174,6 +174,7 @@ def from_pretrained(
         quantize_to: str | DType | None = None,
         dimensionality: int | None = None,
         vocabulary_quantization: int | None = None,
+        force_download: bool = True,
     ) -> StaticModel:
         """
         Load a StaticModel from a local path or huggingface hub path.
@@ -190,6 +191,8 @@ def from_pretrained(
             This is useful if you want to load a model with a lower dimensionality.
             Note that this only applies if you have trained your model using mrl or PCA.
         :param vocabulary_quantization: The number of clusters to use for vocabulary quantization.
+        :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+            already present in the cache.
         :return: A StaticModel.
         """
         return _loading_helper(
@@ -202,6 +205,7 @@ def from_pretrained(
             from_sentence_transformers=False,
             normalize=normalize,
             subfolder=subfolder,
+            force_download=force_download,
         )

     @classmethod
@@ -213,6 +217,7 @@ def from_sentence_transformers(
         quantize_to: str | DType | None = None,
         dimensionality: int | None = None,
         vocabulary_quantization: int | None = None,
+        force_download: bool = True,
     ) -> StaticModel:
         """
         Load a StaticModel trained with sentence transformers from a local path or huggingface hub path.
@@ -228,6 +233,8 @@ def from_sentence_transformers(
             This is useful if you want to load a model with a lower dimensionality.
             Note that this only applies if you have trained your model using mrl or PCA.
         :param vocabulary_quantization: The number of clusters to use for vocabulary quantization.
+        :param force_download: Whether to force the download of the model. If False, the model is only downloaded if it is not
+            already present in the cache.
         :return: A StaticModel.
         """
         return _loading_helper(
@@ -240,6 +247,7 @@ def from_sentence_transformers(
             from_sentence_transformers=True,
             normalize=normalize,
             subfolder=None,
+            force_download=force_download,
         )

     @overload
@@ -467,33 +475,6 @@ def push_to_hub(
             self.save_pretrained(temp_dir, model_name=repo_id)
             push_folder_to_hub(Path(temp_dir), subfolder=subfolder, repo_id=repo_id, private=private, token=token)

-    @classmethod
-    def load_local(cls: type[StaticModel], path: PathLike) -> StaticModel:
-        """
-        Loads a model from a local path.
-
-        You should only use this code path if you are concerned with start-up time.
-        Loading via the `from_pretrained` method is safer, and auto-downloads, but
-        also means we import a whole bunch of huggingface code that we don't need.
-
-        Additionally, huggingface will check the most recent version of the model,
-        which can be slow.
-
-        :param path: The path to load the model from. The path is a directory saved by the
-            `save_pretrained` method.
-        :return: A StaticModel
-        :raises: ValueError if the path is not a directory.
-        """
-        path = Path(path)
-        if not path.is_dir():
-            raise ValueError(f"Path {path} is not a directory.")
-
-        embeddings, tokenizer, config, weights, mapping = load_local_model(path)
-
-        return StaticModel(
-            vectors=embeddings, tokenizer=tokenizer, config=config, weights=weights, token_mapping=mapping
-        )
-

 def quantize_model(
     model: StaticModel,
@@ -552,12 +533,13 @@ def _loading_helper(
     cls: type[StaticModel],
     path: PathLike,
     token: str | None,
-    vocabulary_quantization: int | None = None,
-    quantize_to: str | DType | None = None,
-    dimensionality: int | None = None,
-    from_sentence_transformers: bool = False,
-    normalize: bool | None = None,
-    subfolder: str | None = None,
+    vocabulary_quantization: int | None,
+    quantize_to: str | DType | None,
+    dimensionality: int | None,
+    from_sentence_transformers: bool,
+    normalize: bool | None,
+    subfolder: str | None,
+    force_download: bool,
 ) -> StaticModel:
     """Helper function to load a model from a directory."""
     from model2vec.hf_utils import load_pretrained
@@ -570,6 +552,7 @@ def _loading_helper(
         token=token,
         from_sentence_transformers=from_sentence_transformers,
         subfolder=subfolder,
+        force_download=force_download,
     )

     model = cls(
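
Because force_download defaults to True, existing from_pretrained calls keep their old behavior; passing False resolves the newest cached snapshot via _get_latest_model_path and skips the hub round-trip, which also covers the fast-startup use case of the removed load_local. A usage sketch with a hypothetical model id:

from model2vec import StaticModel

# Hypothetical model id; with force_download=False the newest cached
# snapshot is used when present instead of contacting the hub.
model = StaticModel.from_pretrained("some-org/some-static-model", force_download=False)
vectors = model.encode(["a quick test sentence"])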

model2vec/train/base.py

Lines changed: 4 additions & 2 deletions
@@ -81,7 +81,9 @@ def from_pretrained(
         return cls.from_static_model(model=model, out_dim=out_dim, **kwargs)

     @classmethod
-    def from_static_model(cls: type[ModelType], *, model: StaticModel, out_dim: int = 2, **kwargs: Any) -> ModelType:
+    def from_static_model(
+        cls: type[ModelType], *, model: StaticModel, out_dim: int = 2, pad_token: str = "[PAD]", **kwargs: Any
+    ) -> ModelType:
         """Load the model from a static model."""
         model.embedding = np.nan_to_num(model.embedding)
         weights = torch.from_numpy(model.weights) if model.weights is not None else None
@@ -92,7 +94,7 @@ def from_static_model(
             token_mapping = None
         return cls(
             vectors=embeddings_converted,
-            pad_id=model.tokenizer.token_to_id("[PAD]"),
+            pad_id=model.tokenizer.token_to_id(pad_token),
             out_dim=out_dim,
             tokenizer=model.tokenizer,
             token_mapping=token_mapping,
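
The new pad_token argument lets models whose tokenizer uses a padding token other than the literal "[PAD]" resolve a valid pad_id. A hedged sketch, assuming the trainable head is StaticModelForClassification from model2vec.train and a hypothetical base model id; the token string must exist in the tokenizer's vocabulary, otherwise token_to_id returns None:

from model2vec import StaticModel
from model2vec.train import StaticModelForClassification

# Hypothetical model id; "<pad>" stands in for a tokenizer whose padding
# token differs from the "[PAD]" default.
base = StaticModel.from_pretrained("some-org/some-static-model")
classifier = StaticModelForClassification.from_static_model(model=base, pad_token="<pad>")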

model2vec/utils.py

Lines changed: 0 additions & 30 deletions
@@ -102,33 +102,3 @@ def setup_logging() -> None:
         datefmt="%Y-%m-%d %H:%M:%S",
         handlers=[RichHandler(rich_tracebacks=True)],
     )
-
-
-def load_local_model(
-    folder: Path,
-) -> tuple[np.ndarray, Tokenizer, dict[str, str], np.ndarray | None, np.ndarray | None]:
-    """Load a local model."""
-    embeddings_path = folder / "model.safetensors"
-    tokenizer_path = folder / "tokenizer.json"
-    config_path = folder / "config.json"
-
-    opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    embeddings = opened_tensor_file.get_tensor("embeddings")
-    try:
-        weights = opened_tensor_file.get_tensor("weights")
-    except Exception:
-        # Bare except because safetensors does not export its own errors.
-        weights = None
-    try:
-        mapping = opened_tensor_file.get_tensor("mapping")
-    except Exception:
-        mapping = None
-
-    if config_path.exists():
-        config = json.load(open(config_path))
-    else:
-        config = {}
-
-    tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
-
-    return embeddings, tokenizer, config, weights, mapping
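
The deleted helper, like its replacement path in load_pretrained, probes the optional "weights" and "mapping" tensors with a bare except because safetensors does not export its own error types. An alternative sketch that checks the file's key set instead; this is a possible refactor under the same file layout, not what the commit does:

from pathlib import Path

import safetensors


def read_optional_tensors(folder: Path):
    """Read 'embeddings' plus the optional 'weights' and 'mapping' tensors."""
    with safetensors.safe_open(folder / "model.safetensors", framework="numpy") as f:
        available = set(f.keys())  # names of the tensors stored in the file
        embeddings = f.get_tensor("embeddings")
        weights = f.get_tensor("weights") if "weights" in available else None
        mapping = f.get_tensor("mapping") if "mapping" in available else None
    return embeddings, weights, mapping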

tests/test_model.py

Lines changed: 2 additions & 22 deletions
@@ -119,9 +119,9 @@ def test_encode_as_tokens_empty(
     encoded = model.encode_as_sequence("")
     assert np.array_equal(encoded, np.zeros(shape=(0, 2), dtype=model.embedding.dtype))

-    encoded = model.encode_as_sequence(["", ""])
+    encoded_list = model.encode_as_sequence(["", ""])
     out = [np.zeros(shape=(0, 2), dtype=model.embedding.dtype) for _ in range(2)]
-    assert [np.array_equal(x, y) for x, y in zip(encoded, out)]
+    assert [np.array_equal(x, y) for x, y in zip(encoded_list, out)]


 def test_encode_empty_sentence(
@@ -298,23 +298,3 @@ def test_dim(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer, mock_config: d
     model = StaticModel(mock_vectors, mock_tokenizer, mock_config)
     assert model.dim == 2
     assert model.dim == model.embedding.shape[1]
-
-
-def test_local_load_from_model(mock_tokenizer: Tokenizer) -> None:
-    """Test local load from a model."""
-    x = np.ones((mock_tokenizer.get_vocab_size(), 2))
-    with TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        safetensors.numpy.save_file({"embeddings": x}, Path(tempdir) / "model.safetensors")
-        mock_tokenizer.save(str(Path(tempdir) / "tokenizer.json"))
-
-        model = StaticModel.load_local(tempdir_path)
-        assert model.embedding.shape == x.shape
-        assert model.tokenizer.to_str() == mock_tokenizer.to_str()
-        assert model.config == {"normalize": False}
-
-
-def test_local_load_from_model_no_folder() -> None:
-    """Test local load from a model with no folder."""
-    with pytest.raises(ValueError):
-        StaticModel.load_local("woahbuddy_relax_this_is_just_a_test")
