Skip to content
This repository was archived by the owner on Mar 20, 2026. It is now read-only.

Commit 630701e

Browse files
Myle Ott and facebook-github-bot
authored and committed
Add instructions for paraphrasing model (#1968)
Summary: Pull Request resolved: #1968 Reviewed By: ngoyal2707 Differential Revision: D20860682 Pulled By: myleott fbshipit-source-id: b7dced493410a4b9e217e4735eb9cdd0370ad47e
1 parent 5feb564 commit 630701e

3 files changed

Lines changed: 127 additions & 1 deletion

File tree

examples/paraphraser/README.md

Lines changed: 46 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,46 @@
1+
# Paraphrasing with round-trip translation and mixture of experts
2+
3+
Machine translation models can be used to paraphrase text by translating it to
4+
an intermediate language and back (round-trip translation).
5+
6+
This example shows how to paraphrase text by first passing it to an
7+
English-French translation model, followed by a French-English [mixture of
8+
experts translation model](/examples/translation_moe).
9+
10+
##### 0. Setup
11+
12+
Clone fairseq from source and install necessary dependencies:
13+
```bash
14+
git clone https://github.com/pytorch/fairseq.git
15+
cd fairseq
16+
pip install --editable .
17+
pip install sacremoses sentencepiece
18+
```
19+
20+
##### 1. Download models
21+
```bash
22+
wget https://dl.fbaipublicfiles.com/fairseq/models/paraphraser.en-fr.tar.gz
23+
wget https://dl.fbaipublicfiles.com/fairseq/models/paraphraser.fr-en.hMoEup.tar.gz
24+
tar -xzvf paraphraser.en-fr.tar.gz
25+
tar -xzvf paraphraser.fr-en.hMoEup.tar.gz
26+
```
27+
28+
##### 2. Paraphrase
29+
```bash
30+
python examples/paraphraser/paraphrase.py \
31+
--en2fr paraphraser.en-fr \
32+
--fr2en paraphraser.fr-en.hMoEup
33+
# Example input:
34+
# The new date for the Games, postponed for a year in response to the coronavirus pandemic, gives athletes time to recalibrate their training schedules.
35+
# Example outputs:
36+
# Delayed one year in response to the coronavirus pandemic, the new date of the Games gives athletes time to rebalance their training schedule.
37+
# The new date of the Games, which was rescheduled one year in response to the coronavirus (CV) pandemic, gives athletes time to rebalance their training schedule.
38+
# The new date of the Games, postponed one year in response to the coronavirus pandemic, provides athletes with time to rebalance their training schedule.
39+
# The Games' new date, postponed one year in response to the coronavirus pandemic, gives athletes time to rebalance their training schedule.
40+
# The new Games date, postponed one year in response to the coronavirus pandemic, gives the athletes time to rebalance their training schedule.
41+
# The new date of the Games, which was postponed one year in response to the coronavirus pandemic, gives the athletes time to rebalance their training schedule.
42+
# The new date of the Games, postponed one year in response to the coronavirus pandemic, gives athletes time to rebalance their training schedule.
43+
# The new date of the Games, postponed one year in response to the coronavirus pandemic, gives athletes time to re-balance their training schedule.
44+
# The new date of the Games, postponed one year in response to the coronavirus pandemic, gives the athletes time to rebalance their schedule of training.
45+
# The new date of the Games, postponed one year in response to the pandemic of coronavirus, gives the athletes time to rebalance their training schedule.
46+
```

examples/paraphraser/paraphrase.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/env python3 -u
2+
3+
import argparse
4+
import fileinput
5+
import logging
6+
import os
7+
import sys
8+
9+
from fairseq.models.transformer import TransformerModel
10+
11+
12+
logging.getLogger().setLevel(logging.INFO)
13+
14+
15+
def main():
    """Paraphrase English text via round-trip translation.

    Each input line is translated to French with the ``--en2fr`` model,
    then back to English with the ``--fr2en`` mixture-of-experts model,
    printing one paraphrase per expert (``--num-experts`` total).

    Input comes from the positional ``files`` arguments ("-" = stdin).
    Raises RuntimeError if the translation_moe user directory cannot be
    located.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--en2fr', required=True,
                        help='path to en2fr model')
    parser.add_argument('--fr2en', required=True,
                        help='path to fr2en mixture of experts model')
    parser.add_argument('--user-dir',
                        help='path to fairseq examples/translation_moe/src directory')
    parser.add_argument('--num-experts', type=int, default=10,
                        help='(keep at 10 unless using a different model)')
    parser.add_argument('files', nargs='*', default=['-'],
                        help='input files to paraphrase; "-" for stdin')
    args = parser.parse_args()

    # Default --user-dir to <repo>/examples/translation_moe/src, resolved
    # relative to this script's own location (examples/paraphraser/).
    if args.user_dir is None:
        args.user_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),  # examples/
            'translation_moe',
            'src',
        )
    if os.path.exists(args.user_dir):
        # Lazy %-style args: formatting is skipped when INFO is disabled.
        logging.info('found user_dir: %s', args.user_dir)
    else:
        raise RuntimeError(
            'cannot find fairseq examples/translation_moe/src '
            '(tried looking here: {})'.format(args.user_dir)
        )

    logging.info('loading en2fr model from: %s', args.en2fr)
    en2fr = TransformerModel.from_pretrained(
        model_name_or_path=args.en2fr,
        tokenizer='moses',
        bpe='sentencepiece',
    ).eval()

    logging.info('loading fr2en model from: %s', args.fr2en)
    fr2en = TransformerModel.from_pretrained(
        model_name_or_path=args.fr2en,
        tokenizer='moses',
        bpe='sentencepiece',
        user_dir=args.user_dir,
        task='translation_moe',
    ).eval()

    def gen_paraphrases(en):
        """Round-trip *en* through French, once per MoE expert."""
        fr = en2fr.translate(en)
        return [
            fr2en.translate(fr, inference_step_args={'expert': i})
            for i in range(args.num_experts)
        ]

    logging.info('Type the input sentence and press return:')
    for line in fileinput.input(args.files):
        line = line.strip()
        if not line:  # skip blank input lines rather than paraphrasing them
            continue
        for paraphrase in gen_paraphrases(line):
            print(paraphrase)


if __name__ == '__main__':
    main()

fairseq/hub_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ def generate(
145145
beam: int = 5,
146146
verbose: bool = False,
147147
skip_invalid_size_inputs=False,
148+
inference_step_args=None,
148149
**kwargs
149150
) -> List[List[Dict[str, torch.Tensor]]]:
150151
if torch.is_tensor(tokenized_sentences) and tokenized_sentences.dim() == 1:
@@ -159,10 +160,13 @@ def generate(
159160
setattr(gen_args, k, v)
160161
generator = self.task.build_generator(self.models, gen_args)
161162

163+
inference_step_args = inference_step_args or {}
162164
results = []
163165
for batch in self._build_batches(tokenized_sentences, skip_invalid_size_inputs):
164166
batch = utils.apply_to_sample(lambda t: t.to(self.device), batch)
165-
translations = self.task.inference_step(generator, self.models, batch)
167+
translations = self.task.inference_step(
168+
generator, self.models, batch, **inference_step_args
169+
)
166170
for id, hypos in zip(batch["id"].tolist(), translations):
167171
results.append((id, hypos))
168172

0 commit comments

Comments
 (0)