Skip to content

Commit 7d92c10

Browse files
felipemello1 and Felipe Mello authored
fix tag extraction (#2604)
Co-authored-by: Felipe Mello <[email protected]>
1 parent 9d00f14 commit 7d92c10

File tree

1 file changed

+11
-8
lines changed

1 file changed

+11
-8
lines changed

torchtune/dev/grpo/rewards.py

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4,28 +4,31 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
from xml.etree import ElementTree as ET
7+
import re
8+
from typing import Dict
89

910
import torch
1011

1112
from torchtune.modules.transforms.tokenizers import ModelTokenizer
1213

1314

14-
def extract_tags(text: str) -> dict[str, list[str]]:
15+
def extract_tags(text: str) -> Dict:
1516
"""
1617
Parse XML-like tags from text. Returns a dictionary with keys 'think' and 'answer'.
1718
The values are lists of strings, with each string being the content of a tag.
1819
"""
19-
xml_string = f"<root>{text}</root>"
20-
root = ET.fromstring(xml_string)
21-
20+
think_pattern = r"<think>(.*?)</think>"
21+
answer_pattern = r"<answer>(.*?)</answer>"
22+
think_match = re.search(think_pattern, text, re.DOTALL)
23+
answer_match = re.search(answer_pattern, text, re.DOTALL)
24+
cot = think_match.group(1).strip() if think_match else ""
25+
potential_answer = answer_match.group(1).strip() if answer_match else ""
2226
return {
2327
"think": [
24-
elem.text if elem.text is not None else "" for elem in root.findall("think")
28+
cot,
2529
],
2630
"answer": [
27-
elem.text if elem.text is not None else ""
28-
for elem in root.findall("answer")
31+
potential_answer,
2932
],
3033
}
3134

0 commit comments

Comments (0)