import datasets


# Subject boundaries for the MMLU-SR answer_only configuration,
# based on empirical analysis of the dataset structure.
SUBJECT_BOUNDARIES = {
    "anatomy": (100, 228),  # 129 questions (inclusive row range)
    # Add other subjects as needed
}


def filter_by_subject(dataset: datasets.Dataset, subject: str) -> datasets.Dataset:
    """Filter the dataset down to the questions for a single subject."""
    if subject not in SUBJECT_BOUNDARIES:
        raise ValueError(
            f"Unknown subject: {subject}. Available subjects: {list(SUBJECT_BOUNDARIES.keys())}"
        )

    start_idx, end_idx = SUBJECT_BOUNDARIES[subject]

    # The boundaries above are for the test split (13985 rows). If this is a
    # smaller split (like train), scale them proportionally.
    dataset_size = len(dataset)
    if dataset_size != 13985:  # Not the full test split
        scale_factor = dataset_size / 13985
        start_idx = int(start_idx * scale_factor)
        end_idx = int(end_idx * scale_factor)

        # Clamp so we don't index past the end of the split
        end_idx = min(end_idx, dataset_size - 1)

    if start_idx >= dataset_size:
        # Boundaries are completely out of range; return an empty dataset
        return dataset.select([])

    return dataset.select(range(start_idx, end_idx + 1))
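
# Worked example of the scaling above (illustrative only; the 1000-row split
# size is hypothetical): for anatomy's (100, 228) boundaries,
#   start_idx = int(100 * 1000 / 13985) = 7
#   end_idx   = int(228 * 1000 / 13985) = 16
# so rows 7..16 (inclusive) would be selected.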


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _helper(doc):
        # Map the generic column names to the format the task expects:
        # column_0: question, column_1-4: choices, column_5: answer
        out_doc = {
            "questions": doc["column_0"],
            "choices": [doc["column_1"], doc["column_2"], doc["column_3"], doc["column_4"]],
            "answer": doc["column_5"],  # Already in letter format (A, B, C, D)
        }
        return out_doc

    return dataset.map(_helper)
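
# For reference, one processed row (hypothetical values, assuming the raw
# split really uses the generic column_0..column_5 names):
#   in:  {"column_0": "Which bone ...?", "column_1": "femur", ..., "column_5": "A"}
#   out: {"questions": "Which bone ...?", "choices": ["femur", ...], "answer": "A"}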


def doc_to_text(doc):
    return doc["questions"].strip()


def doc_to_target(doc):
    return doc["answer"]


def doc_to_choice(doc):
    return ["A", "B", "C", "D"]
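
# These hooks follow the lm-evaluation-harness convention: a task YAML can
# bind them with `!function` entries. A sketch (the utils module name is an
# assumption):
#   process_docs: !function utils.process_docs_anatomy
#   doc_to_text: !function utils.doc_to_text
#   doc_to_target: !function utils.doc_to_target
#   doc_to_choice: !function utils.doc_to_choice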


def process_docs_anatomy(dataset: datasets.Dataset) -> datasets.Dataset:
    """Process docs specifically for anatomy questions: filter, then format."""
    # Filter to the anatomy slice, then reuse the standard column mapping
    # from process_docs rather than duplicating its helper.
    return process_docs(filter_by_subject(dataset, "anatomy"))
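

if __name__ == "__main__":
    # Minimal smoke test, a sketch only: the dataset path and config below are
    # assumptions -- point them at the actual MMLU-SR answer_only data before
    # running.
    ds = datasets.load_dataset("NiniCat/MMLU-SR", "answer_only", split="test")
    anatomy = process_docs_anatomy(ds)
    print(len(anatomy), "anatomy rows")
    print(doc_to_text(anatomy[0]))
    print("target:", doc_to_target(anatomy[0]))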