6
6
import inflection
7
7
import re
8
8
import itertools
9
+ import os
10
+ import shutil
11
+
12
+ TARGET_REJECTED_DIR = os .getenv ("TARGET_REJECTED_DIR" )
9
13
10
14
logger = singer .get_logger ()
11
15
@@ -43,14 +47,6 @@ def column_clause(name, schema_property):
43
47
return '{} {}' .format (safe_column_name (name ), column_type (schema_property ))
44
48
45
49
46
- def sanitize (value ):
47
- if not isinstance (value , str ):
48
- return value
49
-
50
- # this sequence will cause the CSV load to fail
51
- return value .replace ("\\ u0000" , '' )
52
-
53
-
54
50
def flatten_key (k , parent_key , sep ):
55
51
full_key = parent_key + [k ]
56
52
inflected_key = [inflect_column_name (n ) for n in full_key ]
@@ -108,7 +104,10 @@ def flatten_record(d, parent_key=[], sep='__'):
108
104
109
105
110
106
def primary_column_names (stream_schema_message ):
111
- return [safe_column_name (inflect_column_name (p )) for p in stream_schema_message ['key_properties' ]]
107
+ return [
108
+ safe_column_name (inflect_column_name (p ))
109
+ for p in stream_schema_message ['key_properties' ]
110
+ ]
112
111
113
112
114
113
class DbSync :
@@ -117,6 +116,7 @@ def __init__(self, connection_config, stream_schema_message):
117
116
self .schema_name = self .connection_config ['schema' ]
118
117
self .stream_schema_message = stream_schema_message
119
118
self .flatten_schema = flatten_schema (stream_schema_message ['schema' ])
119
+ self .rejected_count = 0
120
120
121
121
def open_connection (self ):
122
122
conn_string = "host='{}' dbname='{}' user='{}' password='{}' port='{}'" .format (
@@ -153,6 +153,21 @@ def table_name(self, table_name, is_temporary):
153
153
else :
154
154
return '{}.{}' .format (self .schema_name , table_name )
155
155
156
+ def reject_file (self , file ):
157
+ self .rejected_count += 1
158
+
159
+ if not TARGET_REJECTED_DIR :
160
+ return
161
+
162
+ os .makedirs (TARGET_REJECTED_DIR , exist_ok = True )
163
+ rejected_file_name = "{}-{:04d}.rej.csv" .format (self .stream_schema_message ['stream' ],
164
+ self .rejected_count )
165
+ rejected_file_path = os .path .join (TARGET_REJECTED_DIR ,
166
+ rejected_file_name )
167
+
168
+ shutil .copy (file .name , rejected_file_path )
169
+ logger .info ("Saved rejected entries as {}" .format (rejected_file_path ))
170
+
156
171
def record_primary_key_string (self , record ):
157
172
if len (self .stream_schema_message ['key_properties' ]) == 0 :
158
173
return None
@@ -164,35 +179,39 @@ def record_to_csv_line(self, record):
164
179
flatten = flatten_record (record )
165
180
return ',' .join (
166
181
[
167
- json .dumps (sanitize ( flatten [name ]) ) if name in flatten and flatten [name ] else ''
182
+ json .dumps (flatten [name ]) if name in flatten and flatten [name ] else ''
168
183
for name in self .flatten_schema
169
184
]
170
185
)
171
186
172
187
def load_csv (self , file , count ):
173
- file .seek (0 )
174
- stream_schema_message = self .stream_schema_message
175
- stream = stream_schema_message ['stream' ]
176
- logger .info ("Loading {} rows into '{}'" .format (count , stream ))
177
-
178
- with self .open_connection () as connection :
179
- with connection .cursor (cursor_factory = psycopg2 .extras .DictCursor ) as cur :
180
- cur .execute (self .create_table_query (True ))
181
- copy_sql = "COPY {} ({}) FROM STDIN WITH (FORMAT CSV, ESCAPE '\\ ')" .format (
182
- self .table_name (stream , True ),
183
- ', ' .join (self .column_names ())
184
- )
185
- logger .info (copy_sql )
186
- cur .copy_expert (
187
- copy_sql ,
188
- file
189
- )
190
- if len (self .stream_schema_message ['key_properties' ]) > 0 :
191
- cur .execute (self .update_from_temp_table ())
188
+ try :
189
+ file .seek (0 )
190
+ stream_schema_message = self .stream_schema_message
191
+ stream = stream_schema_message ['stream' ]
192
+ logger .info ("Loading {} rows into '{}'" .format (count , stream ))
193
+
194
+ with self .open_connection () as connection :
195
+ with connection .cursor (cursor_factory = psycopg2 .extras .DictCursor ) as cur :
196
+ cur .execute (self .create_table_query (True ))
197
+ copy_sql = "COPY {} ({}) FROM STDIN WITH (FORMAT CSV, ESCAPE '\\ ')" .format (
198
+ self .table_name (stream , True ),
199
+ ', ' .join (self .column_names ())
200
+ )
201
+ logger .info (copy_sql )
202
+ cur .copy_expert (
203
+ copy_sql ,
204
+ file
205
+ )
206
+ if len (self .stream_schema_message ['key_properties' ]) > 0 :
207
+ cur .execute (self .update_from_temp_table ())
208
+ logger .info (cur .statusmessage )
209
+ cur .execute (self .insert_from_temp_table ())
192
210
logger .info (cur .statusmessage )
193
- cur .execute (self .insert_from_temp_table ())
194
- logger .info (cur .statusmessage )
195
- cur .execute (self .drop_temp_table ())
211
+ cur .execute (self .drop_temp_table ())
212
+ except psycopg2 .DataError as err :
213
+ logger .exception ("Failed to load CSV file: {}" .format (file .name ))
214
+ self .reject_file (file )
196
215
197
216
def insert_from_temp_table (self ):
198
217
stream_schema_message = self .stream_schema_message
0 commit comments