Merge pull request #173 from kbase/develop

MrCreosote · web-flow · commit 795ea9279f16 · 2022-05-02T11:05:39.000-07:00
Release v1.3.4
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -1,3 +1,10 @@
+### Version 1.3.4
+- Alter the behavior of the bulk specification file writers to return an error if the
+  input `types` parameter is empty.
+- Fixed a bug in the csv/tsv bulk specification parser that would case a failure if the
+  first header of a file had trailing separators. This occurs if a csv/tsv file is opened and
+  saved by Excel.
+
 ### Version 1.3.3
 - Fixed a bug in the csv/tsv bulk specification parser that would include an empty entry for
   each empty line in the file.
diff --git a/staging_service/app.py b/staging_service/app.py
@@ -34,7 +34,7 @@
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 routes = web.RouteTableDef()
-VERSION = "1.3.3"
+VERSION = "1.3.4"
 
 _DATATYPE_MAPPINGS = None
 
@@ -59,7 +59,7 @@ async def importer_filetypes(request: web.Request) -> web.json_response:
     Returns the file types for the configured datatypes. The returned JSON contains two keys:
     * datatype_to_filetype, which maps import datatypes (like gff_genome) to their accepted
       filetypes (like [FASTA, GFF])
-    * filetype_to_extensions, which maps file types (e.g. FASTA) to their extensions (e.g. 
+    * filetype_to_extensions, which maps file types (e.g. FASTA) to their extensions (e.g.
       *.fa, *.fasta, *.fa.gz, etc.)
 
     This information is currently static over the life of the server.
@@ -197,7 +197,7 @@ async def write_bulk_specification(request: web.Request) -> web.json_response:
 def _createJSONErrorResponse(error_text: str, error_class=web.HTTPBadRequest):
     err = json.dumps({"error": error_text})
     return error_class(text=err, content_type=_APP_JSON)
-    
+
 
 @routes.get("/add-acl-concierge")
 async def add_acl_concierge(request: web.Request):
diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py
@@ -23,7 +23,7 @@
         entry in the list corresponds to a row in the resulting import specification,
         and the order of the list defines the order of the rows.
     Leave the `data` list empty to write an empty template.
-:returns: A mapping of the data types to the files to which they were written. 
+:returns: A mapping of the data types to the files to which they were written.
 """
 # note that we can't use an f string here to interpolate the variables below, e.g.
 # order_and_display, etc.
@@ -82,7 +82,7 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]):
         Leave the {_DATA} list empty to write an empty template.
     """
     if not types:
-        return
+        raise ImportSpecWriteException("At least one data type must be specified")
     for datatype in types:
         # replace this with jsonschema? don't worry about it for now
         _check_string(datatype, "A data type")
@@ -235,4 +235,4 @@ class ImportSpecWriteException(Exception):
     """
     An exception thrown when writing an import specification fails.
     """
-    pass
+    pass
diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py
@@ -10,7 +10,7 @@
 # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26
 from frozendict.core import frozendict
 from pathlib import Path
-from typing import TextIO, Optional as O, Union, Any
+from typing import Optional as O, Union, Any
 
 from staging_service.import_specifications.file_parser import (
     PRIMITIVE_TYPE,
@@ -35,7 +35,7 @@
 _HEADER_REGEX = re.compile(f"{_DATA_TYPE} (\\w+){_HEADER_SEP} "
     + f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)")
 
-_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty"}
+_MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"}
 
 
 class _ParseException(Exception):
@@ -63,26 +63,18 @@ def _parse_header(header: str, spec_source: SpecificationSource, maximum_version
     return match[1], int(match[2])
 
 
-def _required_next(
-    input_: Union[TextIO, Any],  # Any really means a csv reader object
-    spec_source: SpecificationSource,
-    error: str
-) -> Union[str, list[str]]:
-    # returns a string for a TextIO input or a list for a Reader input
-    try:
-        return next(input_)
-    except StopIteration:
-        raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source))
-
 def _csv_next(
-    input_: Union[TextIO, Any],  # Any really means a csv reader object
+    input_: Any,  # Any really means a csv reader object
     line_number: int,
-    expected_line_count: int,
+    expected_line_count: Union[None, int],  # None = skip columns check
     spec_source: SpecificationSource,
     error: str
 ) -> list[str]:
-    line = _required_next(input_, spec_source, error)
-    if len(line) != expected_line_count:
+    try:
+        line = next(input_)
+    except StopIteration:
+        raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source))
+    if expected_line_count and len(line) != expected_line_count:
         raise _ParseException(Error(
             ErrorType.INCORRECT_COLUMN_COUNT,
             f"Incorrect number of items in line {line_number}, "
@@ -91,15 +83,6 @@ def _csv_next(
     return line
 
 
-def _get_datatype(input_: TextIO, spec_source: SpecificationSource, maximum_version: int
-) -> tuple[str, int]:
-    # return is (data type, column count)
-    return _parse_header(
-        _required_next(input_, spec_source, "Missing data type / version header").strip(),
-        spec_source,
-        maximum_version)
-
-
 def _error(error: Error) -> ParseResults:
     return ParseResults(errors = tuple([error]))
 
@@ -155,11 +138,13 @@ def _normalize_headers(
 def _parse_xsv(path: Path, sep: str) -> ParseResults:
     spcsrc = SpecificationSource(path)
     try:
-        if magic.from_file(str(path), mime=True) not in _MAGIC_TEXT_FILES:
-            return _error(Error(ErrorType.PARSE_FAIL, "Not a text file", spcsrc))
+        filetype = magic.from_file(str(path), mime=True)
+        if filetype not in _MAGIC_TEXT_FILES:
+            return _error(Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc))
         with open(path, newline='') as input_:
-            datatype, columns = _get_datatype(input_, spcsrc, _VERSION)
             rdr = csv.reader(input_, delimiter=sep)  # let parser handle quoting
+            dthd = _csv_next(rdr, 1, None, spcsrc, "Missing data type / version header")
+            datatype, columns = _parse_header(dthd[0], spcsrc, _VERSION)
             hd1 = _csv_next(rdr, 2, columns, spcsrc, "Missing 2nd header line")
             param_ids = _normalize_headers(hd1, 2, spcsrc)
             _csv_next(rdr, 3, columns, spcsrc, "Missing 3rd header line")
diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py
@@ -64,9 +64,6 @@ def temp_dir() -> Generator[Path, None, None]:
     }
 }
 
-def test_noop():
-    assert write_csv(Path("."), {}) == {}
-    assert write_tsv(Path("."), {}) == {}
 
 def test_write_csv(temp_dir: Path):
     res = write_csv(temp_dir, _TEST_DATA)
@@ -194,6 +191,7 @@ def test_file_writers_fail():
     E = ImportSpecWriteException
     file_writers_fail(None, {}, ValueError("The folder cannot be null"))
     file_writers_fail(p, None, E("The types value must be a mapping"))
+    file_writers_fail(p, {}, E("At least one data type must be specified"))
     file_writers_fail(
         p, {None: 1}, E("A data type cannot be a non-string or a whitespace only string"))
     file_writers_fail(
@@ -281,4 +279,4 @@ def file_writers_fail(path: Path, types: dict, expected: Exception):
     assert_exception_correct(got.value, expected)
     with raises(Exception) as got:
         write_excel(path, types)
-    assert_exception_correct(got.value, expected)
+    assert_exception_correct(got.value, expected)
diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py
@@ -39,6 +39,9 @@ def temp_dir() -> Generator[Path, None, None]:
 
 
 def test_xsv_parse_success(temp_dir: Path):
+    # Tests a special case where if an xSV file is opened by Excel and then resaved, 
+    # Excel will add separators to the end of the 1st header line. This previously caused
+    # the parse to fail.
     _xsv_parse_success(temp_dir, ',', parse_csv)
     _xsv_parse_success(temp_dir, '\t', parse_tsv)
 
@@ -48,7 +51,7 @@ def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseR
     input_ = temp_dir / str(uuid.uuid4())
     with open(input_, "w") as test_file:
         test_file.writelines([
-            "Data type: some_type; Columns: 4; Version: 1\n",
+            f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n",
             f"spec1{s} spec2{s}   spec3   {s} spec4\n",    # test trimming
             f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n",
             f"val1 {s}   val2   {s}    7     {s} 3.2\n",   # test trimming
@@ -166,7 +169,10 @@ def test_xsv_parse_fail_binary_file(temp_dir: Path):
     res = parse_csv(test_file)
 
     assert res == ParseResults(errors=tuple([
-        Error(ErrorType.PARSE_FAIL, "Not a text file", source_1=SpecificationSource(test_file))
+        Error(
+            ErrorType.PARSE_FAIL,
+            "Not a text file: application/vnd.ms-excel",
+            source_1=SpecificationSource(test_file))
     ]))