@@ -582,40 +582,66 @@ def convert_to_json(self, input_data, output_dir, is_dir=True):
582
582
self ._check_format (Format .JSON )
583
583
ensure_dir (output_dir )
584
584
output_file = os .path .join (output_dir , "result.json" )
585
- records = []
585
+
586
586
if is_dir :
587
- for json_file in glob (os .path .join (input_data , "*.json" )):
588
- with io .open (json_file , encoding = "utf8" ) as f :
589
- records .append (json .load (f ))
587
+ # Memory-optimized: stream JSON writing instead of accumulating in memory
590
588
with io .open (output_file , mode = "w" , encoding = "utf8" ) as fout :
591
- json .dump (records , fout , indent = 2 , ensure_ascii = False )
589
+ fout .write ("[\n " )
590
+ first_record = True
591
+
592
+ for json_file in glob (os .path .join (input_data , "*.json" )):
593
+ with io .open (json_file , encoding = "utf8" ) as f :
594
+ record = json .load (f )
595
+
596
+ if not first_record :
597
+ fout .write (",\n " )
598
+ json .dump (record , fout , indent = 2 , ensure_ascii = False )
599
+ first_record = False
600
+
601
+ # Free memory immediately
602
+ del record
603
+
604
+ fout .write ("\n ]" )
592
605
else :
593
606
copy2 (input_data , output_file )
594
607
595
608
def convert_to_json_min(self, input_data, output_dir, is_dir=True):
    """Export items as a flat JSON array, streaming one record at a time.

    Each record is written to ``<output_dir>/result.json`` as soon as it is
    built, so the full list of records is never held in memory.

    Args:
        input_data: Source passed to the item iterator; iterated with
            ``self.iter_from_dir`` when ``is_dir`` is true, otherwise with
            ``self.iter_from_json_file``.
        output_dir: Directory that receives ``result.json``; it is passed
            to ``ensure_dir`` before the file is opened.
        is_dir: Selects which of the two iterators reads ``input_data``.

    Raises:
        KeyError: If an item lacks one of the keys read unconditionally
            below (``input``, ``output``, ``annotation_id``,
            ``created_at``, ``updated_at``, ``lead_time``).
    """
    self._check_format(Format.JSON_MIN)
    ensure_dir(output_dir)
    output_file = os.path.join(output_dir, "result.json")
    item_iterator = self.iter_from_dir if is_dir else self.iter_from_json_file

    with io.open(output_file, mode="w", encoding="utf8") as fout:
        fout.write("[\n ")
        # Nothing precedes the first record; every later record is
        # preceded by the array delimiter.
        separator = ""

        for item in item_iterator(input_data):
            # Only top-level keys are added to the record and it is
            # serialized right away, so a shallow copy is enough to keep
            # item["input"] itself unmodified. A deep copy or a JSON
            # round-trip would duplicate every nested value for no gain.
            record = dict(item["input"])

            if item.get("id") is not None:
                record["id"] = item["id"]
            for name, value in item["output"].items():
                record[name] = prettify_result(value)
            record["annotator"] = get_annotator(item, int_id=True)
            record["annotation_id"] = item["annotation_id"]
            record["created_at"] = item["created_at"]
            record["updated_at"] = item["updated_at"]
            record["lead_time"] = item["lead_time"]
            if "agreement" in item:
                record["agreement"] = item["agreement"]

            fout.write(separator)
            json.dump(record, fout, indent=2, ensure_ascii=False)
            separator = ",\n "

        # With zero items the file still holds a valid, empty JSON array.
        fout.write("\n ]")
619
645
620
646
def convert_to_csv (self , input_data , output_dir , is_dir = True , ** kwargs ):
621
647
self ._check_format (Format .CSV )
0 commit comments