Skip to content

Commit 6c7a40d

Browse files
authored
fix: PLT-809: Fix json_min memory usage (#483)
1 parent 2029cf3 commit 6c7a40d

File tree

1 file changed

+48
-22
lines changed

1 file changed

+48
-22
lines changed

src/label_studio_sdk/converter/converter.py

Lines changed: 48 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -582,40 +582,66 @@ def convert_to_json(self, input_data, output_dir, is_dir=True):
582582
self._check_format(Format.JSON)
583583
ensure_dir(output_dir)
584584
output_file = os.path.join(output_dir, "result.json")
585-
records = []
585+
586586
if is_dir:
587-
for json_file in glob(os.path.join(input_data, "*.json")):
588-
with io.open(json_file, encoding="utf8") as f:
589-
records.append(json.load(f))
587+
# Memory-optimized: stream JSON writing instead of accumulating in memory
590588
with io.open(output_file, mode="w", encoding="utf8") as fout:
591-
json.dump(records, fout, indent=2, ensure_ascii=False)
589+
fout.write("[\n")
590+
first_record = True
591+
592+
for json_file in glob(os.path.join(input_data, "*.json")):
593+
with io.open(json_file, encoding="utf8") as f:
594+
record = json.load(f)
595+
596+
if not first_record:
597+
fout.write(",\n")
598+
json.dump(record, fout, indent=2, ensure_ascii=False)
599+
first_record = False
600+
601+
# Free memory immediately
602+
del record
603+
604+
fout.write("\n]")
592605
else:
593606
copy2(input_data, output_file)
594607

595608
def convert_to_json_min(self, input_data, output_dir, is_dir=True):
596609
self._check_format(Format.JSON_MIN)
597610
ensure_dir(output_dir)
598611
output_file = os.path.join(output_dir, "result.json")
599-
records = []
600612
item_iterator = self.iter_from_dir if is_dir else self.iter_from_json_file
601613

602-
for item in item_iterator(input_data):
603-
record = deepcopy(item["input"])
604-
if item.get("id") is not None:
605-
record["id"] = item["id"]
606-
for name, value in item["output"].items():
607-
record[name] = prettify_result(value)
608-
record["annotator"] = get_annotator(item, int_id=True)
609-
record["annotation_id"] = item["annotation_id"]
610-
record["created_at"] = item["created_at"]
611-
record["updated_at"] = item["updated_at"]
612-
record["lead_time"] = item["lead_time"]
613-
if "agreement" in item:
614-
record["agreement"] = item["agreement"]
615-
records.append(record)
616-
617614
with io.open(output_file, mode="w", encoding="utf8") as fout:
618-
json.dump(records, fout, indent=2, ensure_ascii=False)
615+
fout.write("[\n")
616+
first_record = True
617+
618+
for item in item_iterator(input_data):
619+
# SAFE memory optimization: use json serialization/deserialization
620+
# This avoids deepcopy but ensures complete isolation of objects
621+
record = json.loads(json.dumps(item["input"]))
622+
623+
if item.get("id") is not None:
624+
record["id"] = item["id"]
625+
for name, value in item["output"].items():
626+
record[name] = prettify_result(value)
627+
record["annotator"] = get_annotator(item, int_id=True)
628+
record["annotation_id"] = item["annotation_id"]
629+
record["created_at"] = item["created_at"]
630+
record["updated_at"] = item["updated_at"]
631+
record["lead_time"] = item["lead_time"]
632+
if "agreement" in item:
633+
record["agreement"] = item["agreement"]
634+
635+
# Write record to file immediately
636+
if not first_record:
637+
fout.write(",\n")
638+
json.dump(record, fout, indent=2, ensure_ascii=False)
639+
first_record = False
640+
641+
# Explicitly delete record to free memory
642+
del record
643+
644+
fout.write("\n]")
619645

620646
def convert_to_csv(self, input_data, output_dir, is_dir=True, **kwargs):
621647
self._check_format(Format.CSV)

0 commit comments

Comments
 (0)