
Drop support for EOL Python 2 #732


Merged: 8 commits, Apr 24, 2020
4 changes: 2 additions & 2 deletions .travis.yml
@@ -7,8 +7,8 @@ cache:
- /home/travis/download
- /home/travis/.cache/pip

- # This matrix tests that the code works on Python 2.7,
- # 3.5, 3.6 (same versions as PyTorch CI), and passes lint.
+ # This matrix tests that the code works on Python 3.5,
+ # 3.6 (same versions as PyTorch CI), and passes lint.
matrix:
fast_finish: true
include:
2 changes: 1 addition & 1 deletion README.rst
@@ -22,7 +22,7 @@ Installation
============


- Make sure you have Python 2.7 or 3.5+ and PyTorch 0.4.0 or newer. You can then install torchtext using pip::
+ Make sure you have Python 3.5+ and PyTorch 0.4.0 or newer. You can then install torchtext using pip::

pip install torchtext

1 change: 0 additions & 1 deletion build_tools/conda/torchtext/meta.yaml
@@ -16,7 +16,6 @@ requirements:
- numpy >=1.11
- pytorch >=1.2
- requests
- - six

build:
number: 1
3 changes: 0 additions & 3 deletions requirements.txt
@@ -4,9 +4,6 @@ tqdm
# Downloading data and other files
requests

- # Legacy
- six

# Optional NLP tools
nltk
spacy
11 changes: 10 additions & 1 deletion setup.py
@@ -37,7 +37,16 @@ def find_version(*file_paths):
license='BSD',

install_requires=[
- 'tqdm', 'requests', 'torch', 'numpy', 'six', 'sentencepiece'
+ 'tqdm', 'requests', 'torch', 'numpy', 'sentencepiece'
],
+ python_requires='>=3.5',
+ classifiers=[
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.5',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3 :: Only',
+ ],

# Package info
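Worth noting (a sketch for context, not part of this diff): python_requires='>=3.5' only blocks Python 2 installs when the user's pip is 9.0 or newer, since older pip ignores the Requires-Python metadata. Some projects add a runtime guard to setup.py as a fallback:

import sys

# Fallback for pip < 9.0, which ignores Requires-Python and would
# otherwise install this release on an unsupported interpreter.
if sys.version_info < (3, 5):
    raise RuntimeError("torchtext requires Python 3.5 or later.")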
1 change: 0 additions & 1 deletion test/data/test_batch.py
@@ -1,4 +1,3 @@
- from __future__ import unicode_literals
import torch
import torchtext.data as data

4 changes: 1 addition & 3 deletions test/data/test_dataset.py
@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
- from __future__ import unicode_literals
import torchtext.data as data
import tempfile
- import six

import pytest

@@ -246,7 +244,7 @@ def test_csv_dataset_quotechar(self):

with tempfile.NamedTemporaryFile(dir=self.test_dir) as f:
for example in example_data:
- f.write(six.b("{}\n".format(",".join(example))))
+ f.write("{}\n".format(",".join(example)).encode("latin-1"))
Contributor:

Where is the need for encode coming from?

Contributor:

In particular latin-1 vs. UTF-8?

Contributor Author:

From the six documentation:

six.b(data)

A “fake” bytes literal. data should always be a normal string literal. ... In Python 3, data is encoded with the latin-1 encoding to bytes.

https://six.readthedocs.io/#six.b

From the six code:

if PY3:
    def b(s):
        return s.encode("latin-1")
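A quick equivalence check (standalone sketch, assumes six is installed): for the ASCII-only rows this test writes, six.b and str.encode("latin-1") produce identical bytes, so latin-1 keeps the replacement byte-for-byte faithful; latin-1 and UTF-8 diverge only on non-ASCII text.

import six

line = "{}\n".format(",".join(["a", "b", "c"]))
# On Python 3, six.b is exactly str.encode("latin-1"):
assert six.b(line) == line.encode("latin-1") == b"a,b,c\n"

# The two encodings differ only once non-ASCII characters appear:
assert "café".encode("latin-1") == b"caf\xe9"
assert "café".encode("utf-8") == b"caf\xc3\xa9"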


TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
fields = {
1 change: 0 additions & 1 deletion test/data/test_field.py
@@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
- from __future__ import unicode_literals
from collections import Counter
import os

16 changes: 4 additions & 12 deletions test/data/test_functional.py
@@ -4,7 +4,6 @@
sentencepiece_numericalizer, sentencepiece_tokenizer, \
custom_replace, simple_space_split
import os
- import sys


class TestFunctional(TorchtextTestCase):
@@ -47,17 +46,10 @@ def test_sentencepiece_tokenizer(self):
self.assertEqual(len(sp_model), 20000)
spm_generator = sentencepiece_tokenizer(sp_model)

- # Handle byte string in Python2 and Unicode string in Python3, respectively
- if sys.version_info < (3, 0):
- ref_results = ['\xe2\x96\x81Sent', 'ence', 'P', 'ie', 'ce', '\xe2\x96\x81is',
- '\xe2\x96\x81an', '\xe2\x96\x81un', 'super', 'vis', 'ed',
- '\xe2\x96\x81text', '\xe2\x96\x81to', 'ken', 'izer',
- '\xe2\x96\x81and', '\xe2\x96\x81de', 'to', 'ken', 'izer']
- else:
- ref_results = ['\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is',
- '\u2581an', '\u2581un', 'super', 'vis', 'ed', '\u2581text',
- '\u2581to', 'ken', 'izer', '\u2581and',
- '\u2581de', 'to', 'ken', 'izer']
+ ref_results = ['\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is',
+ '\u2581an', '\u2581un', 'super', 'vis', 'ed', '\u2581text',
+ '\u2581to', 'ken', 'izer', '\u2581and',
+ '\u2581de', 'to', 'ken', 'izer']

self.assertEqual(list(spm_generator([test_sample]))[0],
ref_results)
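The deleted branch and the kept one spell the same tokens: '\u2581' is the SentencePiece word-boundary marker "▁", and the old Python 2 literals were simply its UTF-8 byte sequence. A one-line check (standalone sketch):

# U+2581 encodes to the bytes the removed Python 2 branch listed:
assert "\u2581Sent".encode("utf-8") == b"\xe2\x96\x81Sent"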
10 changes: 4 additions & 6 deletions test/data/test_pipeline.py
@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
- from __future__ import unicode_literals
- import six
import torchtext.data as data

from ..common.torchtext_test_case import TorchtextTestCase
@@ -20,7 +18,7 @@ def test_pipeline(self):
assert id_pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T"
assert id_pipeline(["1241", "Some String"]) == ["1241", "Some String"]

- pipeline = data.Pipeline(six.text_type.lower)
+ pipeline = data.Pipeline(str.lower)
assert pipeline("Test STring") == "test string"
assert pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t"
assert pipeline(["1241", "Some String"]) == ["1241", "some string"]
@@ -34,10 +32,10 @@ def test_composition(self):
pipeline = data.Pipeline(TestPipeline.repeat_n)
pipeline.add_before(id_pipeline)
pipeline.add_after(id_pipeline)
- pipeline.add_before(six.text_type.lower)
- pipeline.add_after(six.text_type.capitalize)
+ pipeline.add_before(str.lower)
+ pipeline.add_after(str.capitalize)

- other_pipeline = data.Pipeline(six.text_type.swapcase)
+ other_pipeline = data.Pipeline(str.swapcase)
other_pipeline.add_before(pipeline)

# Assert pipeline gives proper results after composition
4 changes: 0 additions & 4 deletions test/data/test_subword.py
@@ -1,14 +1,10 @@
import unittest
import pytest
- import sys

from torchtext import data
from torchtext.datasets import TREC


class TestSubword(unittest.TestCase):
- @pytest.mark.skipif(sys.version_info < (3, 0),
- reason="revtok currently breaks for python 2.7")
def test_subword_trec(self):
TEXT = data.SubwordField()
LABEL = data.Field(sequential=False)
5 changes: 2 additions & 3 deletions test/data/test_utils.py
@@ -1,4 +1,3 @@
- import six
import torchtext.data as data
import pytest
from ..common.torchtext_test_case import TorchtextTestCase
@@ -16,7 +15,7 @@ def test_get_tokenizer_split(self):

def test_get_tokenizer_spacy(self):
# Test SpaCy option, and verify it properly handles punctuation.
- assert data.get_tokenizer("spacy")(six.text_type(self.TEST_STR)) == [
+ assert data.get_tokenizer("spacy")(str(self.TEST_STR)) == [
"A", "string", ",", "particularly", "one", "with", "slightly",
"complex", "punctuation", "."]

@@ -33,7 +32,7 @@ def test_get_tokenizer_moses(self):
"complex", "punctuation", "."]

# Nonbreaking prefixes should tokenize the final period.
- assert moses_tokenizer(six.text_type("abc def.")) == ["abc", "def", "."]
+ assert moses_tokenizer("abc def.") == ["abc", "def", "."]

def test_get_tokenizer_toktokt(self):
# Test Toktok option. Test strings taken from NLTK doctests.
3 changes: 1 addition & 2 deletions test/test_vocab.py
@@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
- from __future__ import unicode_literals
from collections import Counter
import os
import pickle
@@ -101,7 +100,7 @@ def test_vocab_download_fasttext_vectors(self):
# to test string aliases.
for i in range(3):
if i == 2:
- vectors = str("fasttext.simple.300d") # must handle str on Py2
+ vectors = "fasttext.simple.300d"
else:
vectors = FastText(language='simple')

3 changes: 1 addition & 2 deletions torchtext/data/example.py
@@ -1,4 +1,3 @@
- import six
import json
from functools import reduce

@@ -75,7 +74,7 @@ def fromlist(cls, data, fields):
ex = cls()
for (name, field), val in zip(fields, data):
if field is not None:
- if isinstance(val, six.string_types):
+ if isinstance(val, str):
val = val.rstrip('\n')
# Handle field tuples
if isinstance(name, tuple):
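The isinstance swap here (and in field.py and vocab.py below) is behavior-preserving: on Python 3, six's type aliases collapse to the builtins. A quick check (standalone sketch, assumes six is installed):

import six

assert six.string_types == (str,)
assert six.text_type is str
assert six.binary_type is bytes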
13 changes: 4 additions & 9 deletions torchtext/data/field.py
@@ -1,7 +1,6 @@
# coding: utf8
from collections import Counter, OrderedDict
from itertools import chain
- import six
import torch
from tqdm import tqdm

@@ -204,17 +203,13 @@ def __eq__(self, other):
def preprocess(self, x):
"""Load a single example using this field, tokenizing if necessary.

- If the input is a Python 2 `str`, it will be converted to Unicode
- first. If `sequential=True`, it will be tokenized. Then the input
+ If `sequential=True`, the input will be tokenized. Then the input
will be optionally lowercased and passed to the user-provided
`preprocessing` Pipeline."""
- if (six.PY2 and isinstance(x, six.string_types)
- and not isinstance(x, six.text_type)):
- x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
- if self.sequential and isinstance(x, six.text_type):
+ if self.sequential and isinstance(x, str):
x = self.tokenize(x.rstrip('\n'))
if self.lower:
- x = Pipeline(six.text_type.lower)(x)
+ x = Pipeline(str.lower)(x)
if self.sequential and self.use_vocab and self.stop_words is not None:
x = [w for w in x if w not in self.stop_words]
if self.preprocessing is not None:
@@ -351,7 +346,7 @@ def numericalize(self, arr, device=None):
# the data is sequential, since it's unclear how to coerce padding tokens
# to a numeric type.
if not self.sequential:
- arr = [numericalization_func(x) if isinstance(x, six.string_types)
+ arr = [numericalization_func(x) if isinstance(x, str)
else x for x in arr]
if self.postprocessing is not None:
arr = self.postprocessing(arr, None)
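A usage sketch of the now str-only preprocess path (assumes the legacy torchtext.data.Field API as of this PR):

from torchtext import data

# lower=True routes each token through Pipeline(str.lower).
field = data.Field(sequential=True, lower=True, tokenize=str.split)
assert field.preprocess("Hello World\n") == ["hello", "world"]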
2 changes: 0 additions & 2 deletions torchtext/data/iterator.py
@@ -1,5 +1,3 @@
- from __future__ import division

import math
import random

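The removed __future__ import is a no-op on Python 3, where true division is the default:

assert 7 / 2 == 3.5   # '/' is true division by default on Python 3
assert 7 // 2 == 3    # '//' remains floor division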
3 changes: 1 addition & 2 deletions torchtext/data/pipeline.py
@@ -1,8 +1,7 @@
class Pipeline(object):
"""Defines a pipeline for transforming sequence data.

- The input is assumed to be utf-8 encoded `str` (Python 3) or
- `unicode` (Python 2).
+ The input is assumed to be utf-8 encoded `str`.

Attributes:
convert_token: The function to apply to input sequence data.
12 changes: 2 additions & 10 deletions torchtext/utils.py
@@ -1,4 +1,3 @@
- import six
import requests
import csv
from tqdm import tqdm
@@ -133,15 +132,8 @@ def unicode_csv_reader(unicode_csv_data, **kwargs):
maxInt = int(maxInt / 10)
csv.field_size_limit(maxInt)

- if six.PY2:
- # csv.py doesn't do Unicode; encode temporarily as UTF-8:
- csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
- for row in csv_reader:
- # decode UTF-8 back to Unicode, cell by cell:
- yield [cell.decode('utf-8') for cell in row]
- else:
- for line in csv.reader(unicode_csv_data, **kwargs):
- yield line
+ for line in csv.reader(unicode_csv_data, **kwargs):
+ yield line


def utf_8_encoder(unicode_csv_data):
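A usage sketch of the simplified reader (standalone, with io.StringIO standing in for a real file): on Python 3, csv.reader consumes text-mode str iterables directly, which is why the UTF-8 encode/decode round-trip could be dropped.

import csv
import io

# Each cell comes back as str; no manual decoding step is needed.
rows = io.StringIO('a,"b,c",d\n')
assert next(csv.reader(rows)) == ["a", "b,c", "d"]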
12 changes: 3 additions & 9 deletions torchtext/vocab.py
@@ -1,13 +1,11 @@
- from __future__ import unicode_literals
from collections import defaultdict
from functools import partial
import logging
import os
import zipfile
import gzip

- import six
- from six.moves.urllib.request import urlretrieve
+ from urllib.request import urlretrieve
import torch
from tqdm import tqdm
import tarfile
@@ -169,9 +167,7 @@ def load_vectors(self, vectors, **kwargs):
if not isinstance(vectors, list):
vectors = [vectors]
for idx, vector in enumerate(vectors):
- if six.PY2 and isinstance(vector, str):
- vector = six.text_type(vector)
- if isinstance(vector, six.string_types):
+ if isinstance(vector, str):
# Convert the string pretrained vector identifier
# to a Vectors object
if vector not in pretrained_aliases:
@@ -406,7 +402,7 @@ def cache(self, name, cache, url=None, max_vectors=None):
dim))

try:
- if isinstance(word, six.binary_type):
+ if isinstance(word, bytes):
word = word.decode('utf-8')
except UnicodeDecodeError:
logger.info("Skipping non-UTF8 token {}".format(repr(word)))
@@ -507,8 +503,6 @@ def __getitem__(self, token):
vector = torch.Tensor(1, self.dim).zero_()
if token == "<unk>":
return self.unk_init(vector)
- # These literals need to be coerced to unicode for Python 2 compatibility
- # when we try to join them with read ngrams from the files.
chars = ['#BEGIN#'] + list(token) + ['#END#']
num_vectors = 0
for n in [2, 3, 4]:
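For orientation, a standalone sketch of the padded character windows built from chars above (an illustration of the list construction only, not the library's exact lookup logic):

token = "cat"
chars = ['#BEGIN#'] + list(token) + ['#END#']
# Sliding 2-gram windows over the padded character list:
grams = ["".join(chars[i:i + 2]) for i in range(len(chars) - 1)]
assert grams == ['#BEGIN#c', 'ca', 'at', 't#END#']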