Skip to content

Commit 60c050e

Browse files
authored
Merge pull request #11448 from shadeMe/merge-develop-into-v4
Merge `develop` into `v4`
2 parents 4a615ca + 977b847 commit 60c050e

38 files changed

+752
-208
lines changed

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ requires = [
66
"preshed>=3.0.2,<3.1.0",
77
"murmurhash>=0.28.0,<1.1.0",
88
"thinc>=8.1.0,<8.2.0",
9-
"pathy",
109
"numpy>=1.15.0",
1110
]
1211
build-backend = "setuptools.build_meta"

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Our libraries
2-
spacy-legacy>=3.0.9,<3.1.0
2+
spacy-legacy>=3.0.10,<3.1.0
33
spacy-loggers>=1.0.0,<2.0.0
44
cymem>=2.0.2,<2.1.0
55
preshed>=3.0.2,<3.1.0
@@ -34,4 +34,5 @@ mypy>=0.910,<0.970; platform_machine!='aarch64'
3434
types-dataclasses>=0.1.3; python_version < "3.7"
3535
types-mock>=0.1.1
3636
types-requests
37+
types-setuptools>=57.0.0
3738
black>=22.0,<23.0

setup.cfg

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ include_package_data = true
3333
python_requires = >=3.6
3434
install_requires =
3535
# Our libraries
36-
spacy-legacy>=3.0.9,<3.1.0
36+
spacy-legacy>=3.0.10,<3.1.0
3737
spacy-loggers>=1.0.0,<2.0.0
3838
murmurhash>=0.28.0,<1.1.0
3939
cymem>=2.0.2,<2.1.0
@@ -42,9 +42,9 @@ install_requires =
4242
wasabi>=0.9.1,<1.1.0
4343
srsly>=2.4.3,<3.0.0
4444
catalogue>=2.0.6,<2.1.0
45+
# Third-party dependencies
4546
typer>=0.3.0,<0.5.0
4647
pathy>=0.3.5
47-
# Third-party dependencies
4848
tqdm>=4.38.0,<5.0.0
4949
numpy>=1.15.0
5050
requests>=2.13.0,<3.0.0

spacy/__init__.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,21 +31,21 @@ def load(
3131
name: Union[str, Path],
3232
*,
3333
vocab: Union[Vocab, bool] = True,
34-
disable: Iterable[str] = util.SimpleFrozenList(),
35-
enable: Iterable[str] = util.SimpleFrozenList(),
36-
exclude: Iterable[str] = util.SimpleFrozenList(),
34+
disable: Union[str, Iterable[str]] = util.SimpleFrozenList(),
35+
enable: Union[str, Iterable[str]] = util.SimpleFrozenList(),
36+
exclude: Union[str, Iterable[str]] = util.SimpleFrozenList(),
3737
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
3838
) -> Language:
3939
"""Load a spaCy model from an installed package or a local path.
4040
4141
name (str): Package name or model path.
4242
vocab (Vocab): A Vocab object. If True, a vocab is created.
43-
disable (Iterable[str]): Names of pipeline components to disable. Disabled
43+
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
4444
pipes will be loaded but they won't be run unless you explicitly
4545
enable them by calling nlp.enable_pipe.
46-
enable (Iterable[str]): Names of pipeline components to enable. All other
46+
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
4747
pipes will be disabled (but can be enabled later using nlp.enable_pipe).
48-
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
48+
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
4949
components won't be loaded.
5050
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
5151
keyed by section values in dot notation.

spacy/cli/download.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def download_cli(
2020
ctx: typer.Context,
2121
model: str = Arg(..., help="Name of pipeline package to download"),
2222
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
23-
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel")
23+
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
2424
# fmt: on
2525
):
2626
"""
@@ -36,7 +36,12 @@ def download_cli(
3636
download(model, direct, sdist, *ctx.args)
3737

3838

39-
def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None:
39+
def download(
40+
model: str,
41+
direct: bool = False,
42+
sdist: bool = False,
43+
*pip_args,
44+
) -> None:
4045
if (
4146
not (is_package("spacy") or is_package("spacy-nightly"))
4247
and "--no-deps" not in pip_args
@@ -50,13 +55,10 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
5055
"dependencies, you'll have to install them manually."
5156
)
5257
pip_args = pip_args + ("--no-deps",)
53-
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
54-
dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
5558
if direct:
5659
components = model.split("-")
5760
model_name = "".join(components[:-1])
5861
version = components[-1]
59-
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
6062
else:
6163
model_name = model
6264
if model in OLD_MODEL_SHORTCUTS:
@@ -67,13 +69,26 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
6769
model_name = OLD_MODEL_SHORTCUTS[model]
6870
compatibility = get_compatibility()
6971
version = get_version(model_name, compatibility)
70-
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
72+
73+
filename = get_model_filename(model_name, version, sdist)
74+
75+
download_model(filename, pip_args)
7176
msg.good(
7277
"Download and installation successful",
7378
f"You can now load the package via spacy.load('{model_name}')",
7479
)
7580

7681

82+
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
83+
dl_tpl = "{m}-{v}/{m}-{v}{s}"
84+
egg_tpl = "#egg={m}=={v}"
85+
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
86+
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
87+
if sdist:
88+
filename += egg_tpl.format(m=model_name, v=version)
89+
return filename
90+
91+
7792
def get_compatibility() -> dict:
7893
if is_prerelease_version(about.__version__):
7994
version: Optional[str] = about.__version__
@@ -105,6 +120,11 @@ def get_version(model: str, comp: dict) -> str:
105120
return comp[model][0]
106121

107122

123+
def get_latest_version(model: str) -> str:
124+
comp = get_compatibility()
125+
return get_version(model, comp)
126+
127+
108128
def download_model(
109129
filename: str, user_pip_args: Optional[Sequence[str]] = None
110130
) -> None:

spacy/cli/info.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
from typing import Optional, Dict, Any, Union, List
22
import platform
3+
import pkg_resources
4+
import json
35
from pathlib import Path
46
from wasabi import Printer, MarkdownRenderer
57
import srsly
68

79
from ._util import app, Arg, Opt, string_to_list
10+
from .download import get_model_filename, get_latest_version
811
from .. import util
912
from .. import about
1013

@@ -16,17 +19,27 @@ def info_cli(
1619
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
1720
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
1821
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
22+
url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
1923
# fmt: on
2024
):
2125
"""
2226
Print info about spaCy installation. If a pipeline is specified as an argument,
2327
print its meta information. Flag --markdown prints details in Markdown for easy
2428
copy-pasting to GitHub issues.
2529
30+
Flag --url prints only the download URL of the most recent compatible
31+
version of the pipeline.
32+
2633
DOCS: https://spacy.io/api/cli#info
2734
"""
2835
exclude = string_to_list(exclude)
29-
info(model, markdown=markdown, silent=silent, exclude=exclude)
36+
info(
37+
model,
38+
markdown=markdown,
39+
silent=silent,
40+
exclude=exclude,
41+
url=url,
42+
)
3043

3144

3245
def info(
@@ -35,11 +48,20 @@ def info(
3548
markdown: bool = False,
3649
silent: bool = True,
3750
exclude: Optional[List[str]] = None,
51+
url: bool = False,
3852
) -> Union[str, dict]:
3953
msg = Printer(no_print=silent, pretty=not silent)
4054
if not exclude:
4155
exclude = []
42-
if model:
56+
if url:
57+
if model is not None:
58+
title = f"Download info for pipeline '{model}'"
59+
data = info_model_url(model)
60+
print(data["download_url"])
61+
return data
62+
else:
63+
msg.fail("--url option requires a pipeline name", exits=1)
64+
elif model:
4365
title = f"Info about pipeline '{model}'"
4466
data = info_model(model, silent=silent)
4567
else:
@@ -99,11 +121,43 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
99121
meta["source"] = str(model_path.resolve())
100122
else:
101123
meta["source"] = str(model_path)
124+
download_url = info_installed_model_url(model)
125+
if download_url:
126+
meta["download_url"] = download_url
102127
return {
103128
k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
104129
}
105130

106131

132+
def info_installed_model_url(model: str) -> Optional[str]:
133+
"""Given a pipeline name, get the download URL if available, otherwise
134+
return None.
135+
136+
This is only available for pipelines installed as modules that have
137+
dist-info available.
138+
"""
139+
try:
140+
dist = pkg_resources.get_distribution(model)
141+
data = json.loads(dist.get_metadata("direct_url.json"))
142+
return data["url"]
143+
except pkg_resources.DistributionNotFound:
144+
# no such package
145+
return None
146+
except Exception:
147+
# something else, like no file or invalid JSON
148+
return None
149+
150+
def info_model_url(model: str) -> Dict[str, Any]:
151+
"""Return the download URL for the latest version of a pipeline."""
152+
version = get_latest_version(model)
153+
154+
filename = get_model_filename(model, version)
155+
download_url = about.__download_url__ + "/" + filename
156+
release_tpl = "https://github.com/explosion/spacy-models/releases/tag/{m}-{v}"
157+
release_url = release_tpl.format(m=model, v=version)
158+
return {"download_url": download_url, "release_url": release_url}
159+
160+
107161
def get_markdown(
108162
data: Dict[str, Any],
109163
title: Optional[str] = None,

spacy/errors.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,8 +230,9 @@ class Errors(metaclass=ErrorsWithCodes):
230230
"initialized component.")
231231
E004 = ("Can't set up pipeline component: a factory for '{name}' already "
232232
"exists. Existing factory: {func}. New factory: {new_func}")
233-
E005 = ("Pipeline component '{name}' returned None. If you're using a "
234-
"custom component, maybe you forgot to return the processed Doc?")
233+
E005 = ("Pipeline component '{name}' returned {returned_type} instead of a "
234+
"Doc. If you're using a custom component, maybe you forgot to "
235+
"return the processed Doc?")
235236
E006 = ("Invalid constraints for adding pipeline component. You can only "
236237
"set one of the following: before (component name or index), "
237238
"after (component name or index), first (True) or last (True). "

spacy/lang/ca/lemmatizer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,10 @@ def rule_lemmatize(self, token: Token) -> List[str]:
7272
oov_forms.append(form)
7373
if not forms:
7474
forms.extend(oov_forms)
75-
if not forms and string in lookup_table.keys():
76-
forms.append(self.lookup_lemmatize(token)[0])
75+
76+
# use lookups, and fall back to the token itself
7777
if not forms:
78-
forms.append(string)
78+
forms.append(lookup_table.get(string, [string])[0])
7979
forms = list(dict.fromkeys(forms))
8080
self.cache[cache_key] = forms
8181
return forms

spacy/lang/fr/lemmatizer.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,16 @@ def rule_lemmatize(self, token: Token) -> List[str]:
5353
rules = rules_table.get(univ_pos, [])
5454
string = string.lower()
5555
forms = []
56+
# first try lookup in table based on upos
5657
if string in index:
5758
forms.append(string)
5859
self.cache[cache_key] = forms
5960
return forms
61+
62+
# then add anything in the exceptions table
6063
forms.extend(exceptions.get(string, []))
64+
65+
# if nothing found yet, use the rules
6166
oov_forms = []
6267
if not forms:
6368
for old, new in rules:
@@ -69,12 +74,14 @@ def rule_lemmatize(self, token: Token) -> List[str]:
6974
forms.append(form)
7075
else:
7176
oov_forms.append(form)
77+
78+
# if still nothing, add the oov forms from rules
7279
if not forms:
7380
forms.extend(oov_forms)
74-
if not forms and string in lookup_table.keys():
75-
forms.append(self.lookup_lemmatize(token)[0])
81+
82+
# use lookups, which fall back to the token itself
7683
if not forms:
77-
forms.append(string)
84+
forms.append(lookup_table.get(string, [string])[0])
7885
forms = list(dict.fromkeys(forms))
7986
self.cache[cache_key] = forms
8087
return forms

spacy/lang/la/__init__.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from ...language import Language, BaseDefaults
2+
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
3+
from .stop_words import STOP_WORDS
4+
from .lex_attrs import LEX_ATTRS
5+
6+
7+
class LatinDefaults(BaseDefaults):
8+
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
9+
stop_words = STOP_WORDS
10+
lex_attr_getters = LEX_ATTRS
11+
12+
13+
class Latin(Language):
14+
lang = "la"
15+
Defaults = LatinDefaults
16+
17+
18+
__all__ = ["Latin"]

0 commit comments

Comments
 (0)