Data Diversification: A Simple Strategy For Neural Machine Translation

Accepted as conference paper at 34th Conference on Neural Information Processing Systems (NeurIPS 2020), Vancouver, Canada, 2020

Authors: Xuan-Phi Nguyen, Shafiq Joty, Wu Kui, Ai Ti Aw

Github

Paper link: https://arxiv.org/abs/1911.01986

Citation

Please cite as:

@incollection{nguyen2020data,
title = {Data Diversification: A Simple Strategy For Neural Machine Translation},
author = {Xuan-Phi Nguyen and Shafiq Joty and Wu Kui and Ai Ti Aw},
booktitle = {Advances in Neural Information Processing Systems 33},
year = {2020},
publisher = {Curran Associates, Inc.},
}

Pretrained Models

Model Description Dataset Download
WMT'16 En-De Transformer WMT16 English-German model: download (.tar.gz)

Instruction To train WMT English-German

Step 1: Follow instruction from Fairseq to create the WMT’16 Dataset.

Save the processed data as data_fairseq/translate_ende_wmt16_bpe32k

Save the raw data (which contains the file train.tok.clean.bpe.32000.en) to raw_data/wmt_ende

Step 2: copy the same data to data_fairseq/translate_deen_wmt16_bpe32k for De-En

cp -r data_fairseq/translate_ende_wmt16_bpe32k data_fairseq/translate_deen_wmt16_bpe32k

Step 3: Train forward models. Step 3-4 can be done all in parallel, if you have more than 8 GPUs, you can run all 6 models at once.

# ---- Forward (En->De) model training configuration ----
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export seed_prefix=100
export problem=translate_ende_wmt16_bpe32k
export model_name="big_tfm_baseline_df3584_s${seed_prefix}"
export data_dir="$(pwd)/data_fairseq/${problem}"

# Train three forward models; each gets a distinct seed <seed_prefix><index>.
for index in 1 2 3; do
  export model_dir="train_fairseq/${problem}/${model_name}/model_${index}"
  fairseq-train \
    "${data_dir}" \
    -s en -t de \
    --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 0.001 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
    --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-update 43000 \
    --keep-last-epochs 10 \
    --save-dir "${model_dir}" \
    --ddp-backend no_c10d \
    --seed "${seed_prefix}${index}" \
    --max-tokens 3584 \
    --fp16 --update-freq 16 --log-interval 10000 --no-progress-bar
done

Step 4: Train backward models

# ---- Backward (De->En) model training configuration ----
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export seed_prefix=101
export problem=translate_deen_wmt16_bpe32k
export model_name="big_tfm_baseline_df3584_s${seed_prefix}"
export data_dir="$(pwd)/data_fairseq/${problem}"

# Train three backward models; each gets a distinct seed <seed_prefix><index>.
for index in 1 2 3; do
  export model_dir="train_fairseq/${problem}/${model_name}/model_${index}"
  fairseq-train \
    "${data_dir}" \
    -s de -t en \
    --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 0.001 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
    --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-update 43000 \
    --keep-last-epochs 10 \
    --save-dir "${model_dir}" \
    --ddp-backend no_c10d \
    --seed "${seed_prefix}${index}" \
    --max-tokens 3584 \
    --fp16 --update-freq 16 --log-interval 10000 --no-progress-bar
done

Step 5: Inference forward models

# ---- Decode the training set with each forward (En->De) model ----
export CUDA_VISIBLE_DEVICES=0
export seed_prefix=100
export problem=translate_ende_wmt16_bpe32k
export model_name="big_tfm_baseline_df3584_s${seed_prefix}"
export data_dir="$(pwd)/data_fairseq/${problem}"
export beam=5
export lenpen=0.6
export round=1
# FIX: infer_bsz was used below but never defined anywhere in this document,
# which makes --max-tokens receive an empty argument. Set it explicitly
# (max tokens per generation batch — lower it if your GPU runs out of memory).
export infer_bsz=8192

for index in {1..3}
do
  export model_dir="train_fairseq/${problem}/${model_name}/model_${index}"
  export best_file="${model_dir}/checkpoint_best.pt"
  export gen_out="${model_dir}/infer_train_b${beam}_lp${lenpen}"
  # Translate the *train* subset with the best checkpoint; write the raw
  # fairseq-generate output with a plain redirection (the original piped
  # through 'dd of=...', which adds nothing over '>').
  fairseq-generate "${data_dir}" \
      -s en -t de \
      --path "${best_file}" \
      --gen-subset train \
      --max-tokens "${infer_bsz}" --beam "${beam}" --lenpen "${lenpen}" > "${gen_out}"
  # S-lines carry the source sentence (field 2+), H-lines the hypothesis (field 3+).
  grep ^S "${gen_out}" | cut -f2- > "${gen_out}.en"
  grep ^H "${gen_out}" | cut -f3- > "${gen_out}.de"
done

Step 6: Inference backward models

# ---- Decode the training set with each backward (De->En) model ----
export CUDA_VISIBLE_DEVICES=0
export seed_prefix=101
export problem=translate_deen_wmt16_bpe32k
export model_name="big_tfm_baseline_df3584_s${seed_prefix}"
export data_dir="$(pwd)/data_fairseq/${problem}"
export beam=5
export lenpen=0.6
export round=1
# FIX: infer_bsz was used below but never defined anywhere in this document,
# which makes --max-tokens receive an empty argument. Set it explicitly
# (max tokens per generation batch — lower it if your GPU runs out of memory).
export infer_bsz=8192

for index in {1..3}
do
  export model_dir="train_fairseq/${problem}/${model_name}/model_${index}"
  export best_file="${model_dir}/checkpoint_best.pt"
  export gen_out="${model_dir}/infer_train_b${beam}_lp${lenpen}"
  # Translate the *train* subset with the best checkpoint; write the raw
  # fairseq-generate output with a plain redirection (the original piped
  # through 'dd of=...', which adds nothing over '>').
  fairseq-generate "${data_dir}" \
      -s de -t en \
      --path "${best_file}" \
      --gen-subset train \
      --max-tokens "${infer_bsz}" --beam "${beam}" --lenpen "${lenpen}" > "${gen_out}"
  # S-lines carry the source sentence (field 2+), H-lines the hypothesis (field 3+).
  grep ^S "${gen_out}" | cut -f2- > "${gen_out}.de"
  grep ^H "${gen_out}" | cut -f3- > "${gen_out}.en"
done

Step 7: Merge and filter duplicates with the original dataset


# ---- Combine the six synthetic corpora with the original training data ----
export ori=raw_data/wmt_ende/train.tok.clean.bpe.32000
export bw_prefix=train_fairseq/translate_deen_wmt16_bpe32k/big_tfm_baseline_df3584_s101/model_
export fw_prefix=train_fairseq/translate_ende_wmt16_bpe32k/big_tfm_baseline_df3584_s100/model_
# Build a colon-separated list of hypothesis-file prefixes:
# backward models first, then forward models prepended in front.
export prefix=
for i in 1 2 3; do
  export prefix="${bw_prefix}${i}/infer_train_b5_lp0.6:${prefix}"
done
for i in 1 2 3; do
  export prefix="${fw_prefix}${i}/infer_train_b5_lp0.6:${prefix}"
done

mkdir -p raw_data/aug_ende_wmt16_bpe32k_s3_r1
python -u combine_corpus.py --src en --tgt de --ori $ori --hypos $prefix --dir raw_data/aug_ende_wmt16_bpe32k_s3_r1 --out train

export out=data_fairseq/translate_ende_aug_b5_r1_s3_nodup_wmt16_bpe32k
# Reuse the original binarized data so valid/test (and dictionaries) stay
# identical; only the train split is re-binarized from the merged corpus.
cp -r data_fairseq/translate_ende_wmt16_bpe32k "$out"

fairseq-preprocess --source-lang en --target-lang de \
  --trainpref raw_data/aug_ende_wmt16_bpe32k_s3_r1/train \
  --destdir "$out" \
  --nwordssrc 0 --nwordstgt 0 \
  --workers 16 \
  --srcdict "$out/dict.en.txt" --tgtdict "$out/dict.de.txt"

# This should report around 27M sentences

Step 8: Train final models

# ---- Train the final En->De model on the diversified dataset ----
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export seed_prefix=200
export problem=translate_ende_aug_b5_r1_s3_nodup_wmt16_bpe32k
export model_name="big_tfm_baseline_df3584_s${seed_prefix}"
export data_dir="$(pwd)/data_fairseq/${problem}"
export index=1
export model_dir="train_fairseq/${problem}/${model_name}/model_${index}"
fairseq-train \
    "${data_dir}" \
    -s en -t de \
    --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 0.001 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
    --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-update 43000 \
    --keep-last-epochs 10 \
    --save-dir "${model_dir}" \
    --ddp-backend no_c10d \
    --seed "${seed_prefix}${index}" \
    --max-tokens 3584 \
    --fp16 --update-freq 16 --log-interval 10000 --no-progress-bar

export avg_checkpoint="${model_dir}/checkpoint_avg5.pt"

# Average the last 5 epoch checkpoints into a single model.
# NOTE(review): --checkpoint-upper-bound 10000 looks like "effectively no
# bound" on the epoch number here — confirm this is intentional.
python average_checkpoints.py \
        --inputs "${model_dir}" \
        --num-epoch-checkpoints 5 \
        --checkpoint-upper-bound 10000 \
        --output "${avg_checkpoint}"

export gen_out="${model_dir}/infer.test.avg5.b5.lp0.6"
export ref=${gen_out}.ref
export hypo=${gen_out}.hypo
export ref_atat=${ref}.atat
export hypo_atat=${hypo}.atat
export beam=5
export lenpen=0.6
echo "Finish generating averaged, start generating samples"
# Translate the test set with the averaged checkpoint; write the raw output
# with a plain redirection (the original piped through 'dd of=...').
fairseq-generate "${data_dir}" \
    -s en -t de \
    --gen-subset test \
    --path "${avg_checkpoint}" \
    --max-tokens 2048 \
    --beam "${beam}" \
    --lenpen "${lenpen}" \
    --remove-bpe > "${gen_out}"
# T-lines carry the reference (field 2+), H-lines the hypothesis (field 3+).
grep ^T "${gen_out}" | cut -f2- > "${ref}"
grep ^H "${gen_out}" | cut -f3- > "${hypo}"

# Re-insert the ##AT##-##AT## compound markers so BLEU is computed with the
# same tokenization convention as prior WMT En-De work.
perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' < "${hypo}" > "${hypo_atat}"
perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' < "${ref}" > "${ref_atat}"
echo "------ Score BLEU ------------"
# FIX: the original ran '$(which fairseq-score) ...', which silently degrades
# to executing '--sys' as a command when the tool is missing. Invoke it
# directly so a missing install fails with a clear error.
fairseq-score --sys "${hypo_atat}" --ref "${ref_atat}"
# expected: BLEU4 = 30.7