
Drop support for EOL Python 2 #732


Merged: 8 commits, Apr 24, 2020
4 changes: 2 additions & 2 deletions .travis.yml
@@ -7,8 +7,8 @@ cache:
- /home/travis/download
- /home/travis/.cache/pip

- # This matrix tests that the code works on Python 2.7,
- # 3.5, 3.6 (same versions as PyTorch CI), and passes lint.
+ # This matrix tests that the code works on Python 3.5,
+ # 3.6 (same versions as PyTorch CI), and passes lint.
matrix:
fast_finish: true
include:
2 changes: 1 addition & 1 deletion README.rst
@@ -22,7 +22,7 @@ Installation
============


- Make sure you have Python 2.7 or 3.5+ and PyTorch 0.4.0 or newer. You can then install torchtext using pip::
+ Make sure you have Python 3.5+ and PyTorch 0.4.0 or newer. You can then install torchtext using pip::

pip install torchtext

1 change: 0 additions & 1 deletion build_tools/conda/torchtext/meta.yaml
@@ -16,7 +16,6 @@ requirements:
- numpy >=1.11
- pytorch >=1.2
- requests
- - six

build:
number: 1
3 changes: 0 additions & 3 deletions requirements.txt
@@ -4,9 +4,6 @@ tqdm
# Downloading data and other files
requests

- # Legacy
- six

# Optional NLP tools
nltk
spacy
11 changes: 10 additions & 1 deletion setup.py
@@ -37,7 +37,16 @@ def find_version(*file_paths):
license='BSD',

install_requires=[
- 'tqdm', 'requests', 'torch', 'numpy', 'six', 'sentencepiece'
+ 'tqdm', 'requests', 'torch', 'numpy', 'sentencepiece'
],
+ python_requires='>=3.5',
+ classifiers=[
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.5',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3 :: Only',
+ ],

# Package info
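Worth noting (a sketch for context, not part of this diff): python_requires='>=3.5' only blocks Python 2 installs when the user's pip is 9.0 or newer, since older pip ignores the Requires-Python metadata. Some projects add a runtime guard to setup.py as a fallback:

import sys

# Fallback for pip < 9.0, which ignores Requires-Python and would
# otherwise install this release on an unsupported interpreter.
if sys.version_info < (3, 5):
    raise RuntimeError("torchtext requires Python 3.5 or later.")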
1 change: 0 additions & 1 deletion test/data/test_batch.py
@@ -1,4 +1,3 @@
- from __future__ import unicode_literals
import torch
import torchtext.data as data

4 changes: 1 addition & 3 deletions test/data/test_dataset.py
@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
- from __future__ import unicode_literals
import torchtext.data as data
import tempfile
- import six

import pytest

@@ -246,7 +244,7 @@ def test_csv_dataset_quotechar(self):

with tempfile.NamedTemporaryFile(dir=self.test_dir) as f:
for example in example_data:
- f.write(six.b("{}\n".format(",".join(example))))
+ f.write("{}\n".format(",".join(example)).encode("latin-1"))
Contributor:

Where is the need for encode coming from?

Contributor:

In particular latin-1 vs. UTF-8?

Contributor Author:

From the six documentation:

six.b(data)

A “fake” bytes literal. data should always be a normal string literal. ... In Python 3, data is encoded with the latin-1 encoding to bytes.

https://six.readthedocs.io/#six.b

From the six code:

if PY3:
    def b(s):
        return s.encode("latin-1")
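A quick equivalence check (standalone sketch, assumes six is installed): for the ASCII-only rows this test writes, six.b and str.encode("latin-1") produce identical bytes, so latin-1 keeps the replacement byte-for-byte faithful; latin-1 and UTF-8 diverge only on non-ASCII text.

import six

line = "{}\n".format(",".join(["a", "b", "c"]))
# On Python 3, six.b is exactly str.encode("latin-1"):
assert six.b(line) == line.encode("latin-1") == b"a,b,c\n"

# The two encodings differ only once non-ASCII characters appear:
assert "café".encode("latin-1") == b"caf\xe9"
assert "café".encode("utf-8") == b"caf\xc3\xa9"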


TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
fields = {
1 change: 0 additions & 1 deletion test/data/test_field.py
@@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
- from __future__ import unicode_literals
from collections import Counter
import os

16 changes: 4 additions & 12 deletions test/data/test_functional.py
@@ -4,7 +4,6 @@
sentencepiece_numericalizer, sentencepiece_tokenizer, \
custom_replace, simple_space_split
import os
- import sys


class TestFunctional(TorchtextTestCase):
@@ -47,17 +46,10 @@ def test_sentencepiece_tokenizer(self):
self.assertEqual(len(sp_model), 20000)
spm_generator = sentencepiece_tokenizer(sp_model)

- # Handle byte string in Python2 and Unicode string in Python3, respectively
- if sys.version_info < (3, 0):
- ref_results = ['\xe2\x96\x81Sent', 'ence', 'P', 'ie', 'ce', '\xe2\x96\x81is',
- '\xe2\x96\x81an', '\xe2\x96\x81un', 'super', 'vis', 'ed',
- '\xe2\x96\x81text', '\xe2\x96\x81to', 'ken', 'izer',
- '\xe2\x96\x81and', '\xe2\x96\x81de', 'to', 'ken', 'izer']
- else:
- ref_results = ['\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is',
- '\u2581an', '\u2581un', 'super', 'vis', 'ed', '\u2581text',
- '\u2581to', 'ken', 'izer', '\u2581and',
- '\u2581de', 'to', 'ken', 'izer']
+ ref_results = ['\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is',
+ '\u2581an', '\u2581un', 'super', 'vis', 'ed', '\u2581text',
+ '\u2581to', 'ken', 'izer', '\u2581and',
+ '\u2581de', 'to', 'ken', 'izer']

self.assertEqual(list(spm_generator([test_sample]))[0],
ref_results)
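The deleted branch and the kept one spell the same tokens: '\u2581' is the SentencePiece word-boundary marker "▁", and the old Python 2 literals were simply its UTF-8 byte sequence. A one-line check (standalone sketch):

# U+2581 encodes to the bytes the removed Python 2 branch listed:
assert "\u2581Sent".encode("utf-8") == b"\xe2\x96\x81Sent"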
10 changes: 4 additions & 6 deletions test/data/test_pipeline.py
@@ -1,6 +1,4 @@
# -*- coding: utf-8 -*-
- from __future__ import unicode_literals
- import six
import torchtext.data as data

from ..common.torchtext_test_case import TorchtextTestCase
@@ -20,7 +18,7 @@ def test_pipeline(self):
assert id_pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T"
assert id_pipeline(["1241", "Some String"]) == ["1241", "Some String"]

- pipeline = data.Pipeline(six.text_type.lower)
+ pipeline = data.Pipeline(str.lower)
assert pipeline("Test STring") == "test string"
assert pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t"
assert pipeline(["1241", "Some String"]) == ["1241", "some string"]
@@ -34,10 +32,10 @@ def test_composition(self):
pipeline = data.Pipeline(TestPipeline.repeat_n)
pipeline.add_before(id_pipeline)
pipeline.add_after(id_pipeline)
- pipeline.add_before(six.text_type.lower)
- pipeline.add_after(six.text_type.capitalize)
+ pipeline.add_before(str.lower)
+ pipeline.add_after(str.capitalize)

- other_pipeline = data.Pipeline(six.text_type.swapcase)
+ other_pipeline = data.Pipeline(str.swapcase)
other_pipeline.add_before(pipeline)

# Assert pipeline gives proper results after composition
4 changes: 0 additions & 4 deletions test/data/test_subword.py
@@ -1,14 +1,10 @@
import unittest
import pytest
- import sys

from torchtext import data
from torchtext.datasets import TREC


class TestSubword(unittest.TestCase):
- @pytest.mark.skipif(sys.version_info < (3, 0),
- reason="revtok currently breaks for python 2.7")
def test_subword_trec(self):
TEXT = data.SubwordField()
LABEL = data.Field(sequential=False)
5 changes: 2 additions & 3 deletions test/data/test_utils.py
@@ -1,4 +1,3 @@
- import six
import torchtext.data as data
import pytest
from ..common.torchtext_test_case import TorchtextTestCase
@@ -16,7 +15,7 @@ def test_get_tokenizer_split(self):

def test_get_tokenizer_spacy(self):
# Test SpaCy option, and verify it properly handles punctuation.
- assert data.get_tokenizer("spacy")(six.text_type(self.TEST_STR)) == [
+ assert data.get_tokenizer("spacy")(str(self.TEST_STR)) == [
"A", "string", ",", "particularly", "one", "with", "slightly",
"complex", "punctuation", "."]

@@ -33,7 +32,7 @@ def test_get_tokenizer_moses(self):
"complex", "punctuation", "."]

# Nonbreaking prefixes should tokenize the final period.
- assert moses_tokenizer(six.text_type("abc def.")) == ["abc", "def", "."]
+ assert moses_tokenizer("abc def.") == ["abc", "def", "."]

def test_get_tokenizer_toktokt(self):
# Test Toktok option. Test strings taken from NLTK doctests.
3 changes: 1 addition & 2 deletions test/test_vocab.py
@@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
- from __future__ import unicode_literals
from collections import Counter
import os
import pickle
@@ -101,7 +100,7 @@ def test_vocab_download_fasttext_vectors(self):
# to test string aliases.
for i in range(3):
if i == 2:
- vectors = str("fasttext.simple.300d") # must handle str on Py2
+ vectors = "fasttext.simple.300d"
else:
vectors = FastText(language='simple')

3 changes: 1 addition & 2 deletions torchtext/data/example.py
@@ -1,4 +1,3 @@
- import six
import json
from functools import reduce

@@ -75,7 +74,7 @@ def fromlist(cls, data, fields):
ex = cls()
for (name, field), val in zip(fields, data):
if field is not None:
- if isinstance(val, six.string_types):
+ if isinstance(val, str):
val = val.rstrip('\n')
# Handle field tuples
if isinstance(name, tuple):
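The isinstance swap here (and in field.py and vocab.py below) is behavior-preserving: on Python 3, six's type aliases collapse to the builtins. A quick check (standalone sketch, assumes six is installed):

import six

assert six.string_types == (str,)
assert six.text_type is str
assert six.binary_type is bytes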
13 changes: 4 additions & 9 deletions torchtext/data/field.py
@@ -1,7 +1,6 @@
# coding: utf8
from collections import Counter, OrderedDict
from itertools import chain
- import six
import torch
from tqdm import tqdm

@@ -204,17 +203,13 @@ def __eq__(self, other):
def preprocess(self, x):
"""Load a single example using this field, tokenizing if necessary.

- If the input is a Python 2 `str`, it will be converted to Unicode
- first. If `sequential=True`, it will be tokenized. Then the input
+ If `sequential=True`, the input will be tokenized. Then the input
will be optionally lowercased and passed to the user-provided
`preprocessing` Pipeline."""
- if (six.PY2 and isinstance(x, six.string_types)
- and not isinstance(x, six.text_type)):
- x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
- if self.sequential and isinstance(x, six.text_type):
+ if self.sequential and isinstance(x, str):
x = self.tokenize(x.rstrip('\n'))
if self.lower:
- x = Pipeline(six.text_type.lower)(x)
+ x = Pipeline(str.lower)(x)
if self.sequential and self.use_vocab and self.stop_words is not None:
x = [w for w in x if w not in self.stop_words]
if self.preprocessing is not None:
@@ -351,7 +346,7 @@ def numericalize(self, arr, device=None):
# the data is sequential, since it's unclear how to coerce padding tokens
# to a numeric type.
if not self.sequential:
- arr = [numericalization_func(x) if isinstance(x, six.string_types)
+ arr = [numericalization_func(x) if isinstance(x, str)
else x for x in arr]
if self.postprocessing is not None:
arr = self.postprocessing(arr, None)
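A usage sketch of the now str-only preprocess path (assumes the legacy torchtext.data.Field API as of this PR):

from torchtext import data

# lower=True routes each token through Pipeline(str.lower).
field = data.Field(sequential=True, lower=True, tokenize=str.split)
assert field.preprocess("Hello World\n") == ["hello", "world"]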
2 changes: 0 additions & 2 deletions torchtext/data/iterator.py
@@ -1,5 +1,3 @@
- from __future__ import division

import math
import random

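The removed __future__ import is a no-op on Python 3, where true division is the default:

assert 7 / 2 == 3.5   # '/' is true division by default on Python 3
assert 7 // 2 == 3    # '//' remains floor division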
3 changes: 1 addition & 2 deletions torchtext/data/pipeline.py
@@ -1,8 +1,7 @@
class Pipeline(object):
"""Defines a pipeline for transforming sequence data.

- The input is assumed to be utf-8 encoded `str` (Python 3) or
- `unicode` (Python 2).
+ The input is assumed to be utf-8 encoded `str`.

Attributes:
convert_token: The function to apply to input sequence data.
12 changes: 2 additions & 10 deletions torchtext/utils.py
@@ -1,4 +1,3 @@
- import six
import requests
import csv
from tqdm import tqdm
@@ -133,15 +132,8 @@ def unicode_csv_reader(unicode_csv_data, **kwargs):
maxInt = int(maxInt / 10)
csv.field_size_limit(maxInt)

- if six.PY2:
- # csv.py doesn't do Unicode; encode temporarily as UTF-8:
- csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
- for row in csv_reader:
- # decode UTF-8 back to Unicode, cell by cell:
- yield [cell.decode('utf-8') for cell in row]
- else:
- for line in csv.reader(unicode_csv_data, **kwargs):
- yield line
+ for line in csv.reader(unicode_csv_data, **kwargs):
+ yield line


def utf_8_encoder(unicode_csv_data):
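A usage sketch of the simplified reader (standalone, with io.StringIO standing in for a real file): on Python 3, csv.reader consumes text-mode str iterables directly, which is why the UTF-8 encode/decode round-trip could be dropped.

import csv
import io

# Each cell comes back as str; no manual decoding step is needed.
rows = io.StringIO('a,"b,c",d\n')
assert next(csv.reader(rows)) == ["a", "b,c", "d"]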
12 changes: 3 additions & 9 deletions torchtext/vocab.py
@@ -1,13 +1,11 @@
- from __future__ import unicode_literals
from collections import defaultdict
from functools import partial
import logging
import os
import zipfile
import gzip

- import six
- from six.moves.urllib.request import urlretrieve
+ from urllib.request import urlretrieve
import torch
from tqdm import tqdm
import tarfile
@@ -169,9 +167,7 @@ def load_vectors(self, vectors, **kwargs):
if not isinstance(vectors, list):
vectors = [vectors]
for idx, vector in enumerate(vectors):
- if six.PY2 and isinstance(vector, str):
- vector = six.text_type(vector)
- if isinstance(vector, six.string_types):
+ if isinstance(vector, str):
# Convert the string pretrained vector identifier
# to a Vectors object
if vector not in pretrained_aliases:
@@ -406,7 +402,7 @@ def cache(self, name, cache, url=None, max_vectors=None):
dim))

try:
- if isinstance(word, six.binary_type):
+ if isinstance(word, bytes):
word = word.decode('utf-8')
except UnicodeDecodeError:
logger.info("Skipping non-UTF8 token {}".format(repr(word)))
@@ -507,8 +503,6 @@ def __getitem__(self, token):
vector = torch.Tensor(1, self.dim).zero_()
if token == "<unk>":
return self.unk_init(vector)
- # These literals need to be coerced to unicode for Python 2 compatibility
- # when we try to join them with read ngrams from the files.
chars = ['#BEGIN#'] + list(token) + ['#END#']
num_vectors = 0
for n in [2, 3, 4]:
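For orientation, a standalone sketch of the padded character windows built from chars above (an illustration of the list construction only, not the library's exact lookup logic):

token = "cat"
chars = ['#BEGIN#'] + list(token) + ['#END#']
# Sliding 2-gram windows over the padded character list:
grams = ["".join(chars[i:i + 2]) for i in range(len(chars) - 1)]
assert grams == ['#BEGIN#c', 'ca', 'at', 't#END#']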