1010# TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26
1111from frozendict .core import frozendict
1212from pathlib import Path
13- from typing import TextIO , Optional as O , Union , Any
13+ from typing import Optional as O , Union , Any
1414
1515from staging_service .import_specifications .file_parser import (
1616 PRIMITIVE_TYPE ,
3535_HEADER_REGEX = re .compile (f"{ _DATA_TYPE } (\\ w+){ _HEADER_SEP } "
3636 + f"{ _COLUMN_STR } (\\ d+){ _HEADER_SEP } { _VERSION_STR } (\\ d+)" )
3737
38- _MAGIC_TEXT_FILES = {"text/plain" , "inode/x-empty" }
38+ _MAGIC_TEXT_FILES = {"text/plain" , "inode/x-empty" , "application/csv" , "text/csv" }
3939
4040
4141class _ParseException (Exception ):
@@ -63,26 +63,18 @@ def _parse_header(header: str, spec_source: SpecificationSource, maximum_version
6363 return match [1 ], int (match [2 ])
6464
6565
66- def _required_next (
67- input_ : Union [TextIO , Any ], # Any really means a csv reader object
68- spec_source : SpecificationSource ,
69- error : str
70- ) -> Union [str , list [str ]]:
71- # returns a string for a TextIO input or a list for a Reader input
72- try :
73- return next (input_ )
74- except StopIteration :
75- raise _ParseException (Error (ErrorType .PARSE_FAIL , error , spec_source ))
76-
7766def _csv_next (
78- input_ : Union [ TextIO , Any ] , # Any really means a csv reader object
67+ input_ : Any , # Any really means a csv reader object
7968 line_number : int ,
80- expected_line_count : int ,
69+ expected_line_count : Union [ None , int ], # None = skip columns check
8170 spec_source : SpecificationSource ,
8271 error : str
8372) -> list [str ]:
84- line = _required_next (input_ , spec_source , error )
85- if len (line ) != expected_line_count :
73+ try :
74+ line = next (input_ )
75+ except StopIteration :
76+ raise _ParseException (Error (ErrorType .PARSE_FAIL , error , spec_source ))
77+ if expected_line_count and len (line ) != expected_line_count :
8678 raise _ParseException (Error (
8779 ErrorType .INCORRECT_COLUMN_COUNT ,
8880 f"Incorrect number of items in line { line_number } , "
@@ -91,15 +83,6 @@ def _csv_next(
9183 return line
9284
9385
94- def _get_datatype (input_ : TextIO , spec_source : SpecificationSource , maximum_version : int
95- ) -> tuple [str , int ]:
96- # return is (data type, column count)
97- return _parse_header (
98- _required_next (input_ , spec_source , "Missing data type / version header" ).strip (),
99- spec_source ,
100- maximum_version )
101-
102-
def _error(error: Error) -> ParseResults:
    """Wrap a single ``Error`` in a ``ParseResults``.

    The returned object is built with ``errors`` set to a one-element tuple
    containing ``error``.
    """
    return ParseResults(errors=(error,))
10588
@@ -155,11 +138,13 @@ def _normalize_headers(
155138def _parse_xsv (path : Path , sep : str ) -> ParseResults :
156139 spcsrc = SpecificationSource (path )
157140 try :
158- if magic .from_file (str (path ), mime = True ) not in _MAGIC_TEXT_FILES :
159- return _error (Error (ErrorType .PARSE_FAIL , "Not a text file" , spcsrc ))
141+ filetype = magic .from_file (str (path ), mime = True )
142+ if filetype not in _MAGIC_TEXT_FILES :
143+ return _error (Error (ErrorType .PARSE_FAIL , "Not a text file: " + filetype , spcsrc ))
160144 with open (path , newline = '' ) as input_ :
161- datatype , columns = _get_datatype (input_ , spcsrc , _VERSION )
162145 rdr = csv .reader (input_ , delimiter = sep ) # let parser handle quoting
146+ dthd = _csv_next (rdr , 1 , None , spcsrc , "Missing data type / version header" )
147+ datatype , columns = _parse_header (dthd [0 ], spcsrc , _VERSION )
163148 hd1 = _csv_next (rdr , 2 , columns , spcsrc , "Missing 2nd header line" )
164149 param_ids = _normalize_headers (hd1 , 2 , spcsrc )
165150 _csv_next (rdr , 3 , columns , spcsrc , "Missing 3rd header line" )
0 commit comments