131 changes: 118 additions & 13 deletions download-data.sh
@@ -13,14 +13,26 @@ set -o pipefail
SRC=en
SI_TGT=si
NE_TGT=ne
KM_TGT=km
PS_TGT=ps
FA_TGT=fa

ROOT=$(dirname "$0")
DATA=$ROOT/data
NE_ROOT=$DATA/all-clean-ne
SI_ROOT=$DATA/all-clean-si
HI_ROOT=$DATA/all-clean-hi

mkdir -p $DATA $NE_ROOT $SI_ROOT $HI_ROOT
KM_ROOT=$DATA/all-clean-km
PS_ROOT=$DATA/all-clean-ps
FA_ROOT=$DATA/all-clean-fa

mkdir -p $DATA \
$NE_ROOT \
$SI_ROOT \
$HI_ROOT \
$KM_ROOT \
$PS_ROOT \
$FA_ROOT

SI_OPUS_DATASETS=(
"$SI_ROOT/GNOME.en-si"
@@ -48,6 +60,92 @@ NE_OPUS_URLS=(
"https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-ne.txt.zip"
)

KM_OPUS_DATASETS=(
"$KM_ROOT/GNOME.en-km"
"$KM_ROOT/Ubuntu.en-km"
"$KM_ROOT/KDE4.en-km"
"$KM_ROOT/GlobalVoices.en-km"
"$KM_ROOT/Tatoeba.en-km"
)

KM_OPUS_URLS=(
"https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-km.txt.zip"
"https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-km.txt.zip"
"https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-km.txt.zip"
"https://object.pouta.csc.fi/OPUS-GlobalVoices/v2017q3/moses/en-km.txt.zip"
"https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-km.txt.zip"
)

PS_OPUS_DATASETS=(
"$PS_ROOT/GNOME.en-ps"
"$PS_ROOT/Ubuntu.en-ps"
"$PS_ROOT/KDE4.en-ps"
"$PS_ROOT/Tatoeba.en-ps"
"$PS_ROOT/Wikimedia.en-ps"
)

PS_OPUS_URLS=(
"https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-ps.txt.zip"
"https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-ps.txt.zip"
"https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-ps.txt.zip"
"https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-ps.txt.zip"
"https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/moses/en-ps.txt.zip"
)

FA_OPUS_DATASETS=(
"$FA_ROOT/OpenSubtitles.en-fa"
"$FA_ROOT/Tanzil.en-fa"
"$FA_ROOT/TEP.en-fa"
"$FA_ROOT/QED.en-fa"
"$FA_ROOT/Wikipedia.en-fa"
"$FA_ROOT/GNOME.en-fa"
"$FA_ROOT/TED2013.en-fa"
"$FA_ROOT/infopankki.en-fa"
"$FA_ROOT/KDE4.en-fa"
"$FA_ROOT/Ubuntu.en-fa"
"$FA_ROOT/GlobalVoices.en-fa"
)

FA_OPUS_URLS=(
"https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/en-fa.txt.zip"
"https://object.pouta.csc.fi/OPUS-Tanzil/v1/moses/en-fa.txt.zip"
"https://object.pouta.csc.fi/OPUS-TEP/v1/moses/en-fa.txt.zip"
"https://object.pouta.csc.fi/OPUS-QED/v2.0a/moses/en-fa.txt.zip"
"https://object.pouta.csc.fi/OPUS-Wikipedia/v1.0/moses/en-fa.txt.zip"
"https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-fa.txt.zip"
"https://object.pouta.csc.fi/OPUS-TED2013/v1.1/moses/en-fa.txt.zip"
"https://object.pouta.csc.fi/OPUS-infopankki/v1/moses/en-fa.txt.zip"
"https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-fa.txt.zip"
"https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-fa.txt.zip"
"https://object.pouta.csc.fi/OPUS-GlobalVoices/v2017q3/moses/en-fa.txt.zip"
)

HI_OPUS_DATASETS=(
"$HI_ROOT/Tanzil.en-hi"
"$HI_ROOT/Tatoeba.en-hi"
"$HI_ROOT/GNOME.en-hi"
"$HI_ROOT/QED.en-hi"
"$HI_ROOT/bible-uedin.en-hi"
"$HI_ROOT/OpenSubtitles.en-hi"
"$HI_ROOT/KDE4.en-hi"
"$HI_ROOT/Ubuntu.en-hi"
"$HI_ROOT/WMT-News.en-hi"
"$HI_ROOT/GlobalVoices.en-hi"
)

HI_OPUS_URLS=(
"https://object.pouta.csc.fi/OPUS-Tanzil/v1/moses/en-hi.txt.zip"
"https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-hi.txt.zip"
"https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-hi.txt.zip"
"https://object.pouta.csc.fi/OPUS-QED/v2.0a/moses/en-hi.txt.zip"
"https://object.pouta.csc.fi/OPUS-bible-uedin/v1/moses/en-hi.txt.zip"
"https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/en-hi.txt.zip"
"https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-hi.txt.zip"
"https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-hi.txt.zip"
"https://object.pouta.csc.fi/OPUS-WMT-News/v2019/moses/en-hi.txt.zip"
"https://object.pouta.csc.fi/OPUS-GlobalVoices/v2017q3/moses/en-hi.txt.zip"
)

REMOVE_FILE_PATHS=()

# Download data
@@ -74,30 +172,34 @@ download_data() {
download_opus_data() {
LANG_ROOT=$1
TGT=$2

if [ "$TGT" = "si" ]; then
URLS=("${SI_OPUS_URLS[@]}")
DATASETS=("${SI_OPUS_DATASETS[@]}")
else
elif [ "$TGT" = "ne" ]; then
URLS=("${NE_OPUS_URLS[@]}")
DATASETS=("${NE_OPUS_DATASETS[@]}")
elif [ "$TGT" = "km" ]; then
URLS=("${KM_OPUS_URLS[@]}")
DATASETS=("${KM_OPUS_DATASETS[@]}")
elif [ "$TGT" = "ps" ]; then
URLS=("${PS_OPUS_URLS[@]}")
DATASETS=("${PS_OPUS_DATASETS[@]}")
elif [ "$TGT" = "fa" ]; then
Contributor: We don't have Farsi in the original flores. Is there a reason why you are including it in the target?

Author: I believe the intent was to improve the Pashto performance of a multilingual model by also training on Farsi, a related language. The Pashto-English parallel corpus was very limited, so being able to make use of the Farsi-English corpus would hopefully be a boon. I don't recall whether we tested this or, if we did, whether it had an impact.

URLS=("${FA_OPUS_URLS[@]}")
DATASETS=("${FA_OPUS_DATASETS[@]}")
else # "$TGT" = "hi"
URLS=("${HI_OPUS_URLS[@]}")
DATASETS=("${HI_OPUS_DATASETS[@]}")
fi

# Download and extract data
for ((i=0;i<${#URLS[@]};++i)); do
URL=${URLS[i]}
CORPORA=${DATASETS[i]}

download_data $CORPORA $URL
unzip -o $CORPORA -d $LANG_ROOT
REMOVE_FILE_PATHS+=( $CORPORA $CORPORA.xml $CORPORA.ids $LANG_ROOT/README $LANG_ROOT/LICENSE )
done

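# Note: the GNOMEKDEUbuntu.* name assumes the first three DATASETS entries are
# GNOME, Ubuntu and KDE4. That holds for the si, ne, km and ps arrays, but not
# for the fa arrays above, so the fa files produced here actually contain other
# corpora; prepare.sh does not consume these concatenations, so it is only a
# naming quirk.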
cat ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$SRC
cat ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$TGT

REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC )
REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT )
}

download_opus_data $SI_ROOT $SI_TGT
@@ -106,7 +208,10 @@ cp ${SI_OPUS_DATASETS[3]}.$SI_TGT $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SI_TG
REMOVE_FILE_PATHS+=( ${SI_OPUS_DATASETS[3]}.$SRC ${SI_OPUS_DATASETS[3]}.$SI_TGT )

download_opus_data $NE_ROOT $NE_TGT

download_opus_data $KM_ROOT $KM_TGT
download_opus_data $PS_ROOT $PS_TGT
download_opus_data $FA_ROOT $FA_TGT
#download_opus_data $HI_ROOT $HI_TGT
Contributor: Remove commented code.


# Download and extract Global Voices data
GLOBAL_VOICES="$NE_ROOT/globalvoices.2018q4.ne-en"
@@ -153,7 +258,7 @@ REMOVE_FILE_PATHS+=( bible-corpus-1.2.1 bible.tar.gz $BIBLE_TOOLS $XML_BIBLES $X

# Download parallel en-hi corpus
download_data $DATA/en-hi.tgz "http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz"
#download_data $DATA/en-hi.tgz "https://www.cse.iitb.ac.in/~anoopk/share/iitb_en_hi_parallel/iitb_corpus_download/parallel.tgz"
download_data $DATA/en-hi.tgz "https://www.cse.iitb.ac.in/~anoopk/share/iitb_en_hi_parallel/iitb_corpus_download/parallel.tgz"
tar xvzf $DATA/en-hi.tgz
cp parallel/* $HI_ROOT/
REMOVE_FILE_PATHS+=( parallel $DATA/en-hi.tgz )
156 changes: 156 additions & 0 deletions prepare.sh
@@ -0,0 +1,156 @@
#!/bin/bash
#
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

SRC="ps"
SRCS="ps km hi fa"
TGT=en

BPESIZE=15000
TRAIN_MINLEN=1 # remove sentences with <1 BPE token
TRAIN_MAXLEN=250 # remove sentences with >250 BPE tokens

ROOT=$(dirname "$0")
SCRIPTS=$ROOT/scripts
DATA=$ROOT/data
mkdir -p $DATA

SPM_TRAIN=$SCRIPTS/spm_train.py
SPM_ENCODE=$SCRIPTS/spm_encode.py

TRAIN_SETS_PS=(
"all-clean-ps/GNOME.en-ps"
"all-clean-ps/KDE4.en-ps"
"all-clean-ps/Tatoeba.en-ps"
"all-clean-ps/Ubuntu.en-ps"
"all-clean-ps/wikimedia.en-ps"
)
TRAIN_SETS_KM=(
"all-clean-km/GlobalVoices.en-km"
"all-clean-km/GNOME.en-km"
"all-clean-km/KDE4.en-km"
"all-clean-km/Tatoeba.en-km"
"all-clean-km/Ubuntu.en-km"
)
TRAIN_SETS_HI=("all-clean-hi/IITB.en-hi")
DATA_SETS_FA=(
"all-clean-fa/GlobalVoices.en-fa"
"all-clean-fa/GNOME.en-fa"
"all-clean-fa/infopankki.en-fa"
"all-clean-fa/KDE4.en-fa"
"all-clean-fa/OpenSubtitles.en-fa"
"all-clean-fa/QED.en-fa"
"all-clean-fa/Tanzil.en-fa"
"all-clean-fa/TED2013.en-fa"
"all-clean-fa/TEP.en-fa"
"all-clean-fa/Ubuntu.en-fa"
"all-clean-fa/Wikipedia.en-fa"
)

echo "Aggregating splits"
for FILE in "${TRAIN_SETS_PS[@]}" ; do cat "${DATA}/$FILE.ps"; done > $DATA/undeduped-train.ps-en.ps
for FILE in "${TRAIN_SETS_PS[@]}" ; do cat "${DATA}/$FILE.en"; done > $DATA/undeduped-train.ps-en.en
for FILE in "${TRAIN_SETS_KM[@]}" ; do cat "${DATA}/$FILE.km"; done > $DATA/undeduped-train.km-en.km
for FILE in "${TRAIN_SETS_KM[@]}" ; do cat "${DATA}/$FILE.en"; done > $DATA/undeduped-train.km-en.en
for FILE in "${TRAIN_SETS_HI[@]}" ; do cat "${DATA}/$FILE.hi"; done > $DATA/undeduped-train.hi-en.hi
for FILE in "${TRAIN_SETS_HI[@]}" ; do cat "${DATA}/$FILE.en"; done > $DATA/undeduped-train.hi-en.en
for FILE in "${DATA_SETS_FA[@]}" ; do cat "${DATA}/$FILE.fa"; done | awk 'NR % 1500 > 1' > $DATA/undeduped-train.fa-en.fa
for FILE in "${DATA_SETS_FA[@]}" ; do cat "${DATA}/$FILE.en"; done | awk 'NR % 1500 > 1' > $DATA/undeduped-train.fa-en.en
for FILE in "${DATA_SETS_FA[@]}" ; do cat "${DATA}/$FILE.fa"; done | awk 'NR % 1500 == 0' > $DATA/undeduped-valid.fa-en.fa
for FILE in "${DATA_SETS_FA[@]}" ; do cat "${DATA}/$FILE.en"; done | awk 'NR % 1500 == 0' > $DATA/undeduped-valid.fa-en.en
for FILE in "${DATA_SETS_FA[@]}" ; do cat "${DATA}/$FILE.fa"; done | awk 'NR % 1500 == 1' > $DATA/undeduped-test.fa-en.fa
for FILE in "${DATA_SETS_FA[@]}" ; do cat "${DATA}/$FILE.en"; done | awk 'NR % 1500 == 1' > $DATA/undeduped-test.fa-en.en
echo "Deduplicating splits"
dedupe () {
dedupe_set=$1
dedupe_lang=$2
python3 $SCRIPTS/deduplicate.py \
--input-src $DATA/undeduped-$dedupe_set.$dedupe_lang-en.$dedupe_lang \
--input-tgt $DATA/undeduped-$dedupe_set.$dedupe_lang-en.en \
--output-src $DATA/$dedupe_set.$dedupe_lang-en.$dedupe_lang \
--output-tgt $DATA/$dedupe_set.$dedupe_lang-en.en
}
dedupe "train" "ps"
dedupe "train" "km"
dedupe "train" "hi"
dedupe "train" "fa"
dedupe "valid" "fa"
dedupe "test" "fa"
cat "${DEVTEST_PSKM}/ps-en.dev.ps" > $DATA/valid.ps-en.ps
cat "${DEVTEST_PSKM}/ps-en.dev.en" > $DATA/valid.ps-en.en
cat "${DEVTEST_PSKM}/ps-en.devtest.ps" > $DATA/test.ps-en.ps
cat "${DEVTEST_PSKM}/ps-en.devtest.en" > $DATA/test.ps-en.en
cat "${DEVTEST_PSKM}/km-en.dev.km" > $DATA/valid.km-en.km
cat "${DEVTEST_PSKM}/km-en.dev.en" > $DATA/valid.km-en.en
cat "${DEVTEST_PSKM}/km-en.devtest.km" > $DATA/test.km-en.km
cat "${DEVTEST_PSKM}/km-en.devtest.en" > $DATA/test.km-en.en
cat "${DEVTEST_HI}/dev.hi" > $DATA/valid.hi-en.hi
cat "${DEVTEST_HI}/dev.en" > $DATA/valid.hi-en.en
cat "${DEVTEST_HI}/test.hi" > $DATA/test.hi-en.hi
cat "${DEVTEST_HI}/test.en" > $DATA/test.hi-en.en
Author: DEVTEST_PSKM and DEVTEST_HI are placeholders for where these datasets will eventually live.

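For example, they could be pointed at local copies of the dev/devtest splits before running the script (hypothetical paths, shown only as a sketch):

# Hypothetical placeholder values; adjust to wherever the data actually lives.
DEVTEST_PSKM=$DATA/devtest-pskm   # directory holding ps-en.* and km-en.* dev/devtest files
DEVTEST_HI=$DATA/devtest-hi       # directory holding dev.hi/dev.en and test.hi/test.en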
echo "Creating abridged train sets for BPE, so all languages are similarly well-represented"
cat $DATA/train.ps-en.ps | head -100000 > $DATA/abridged.ps
cat $DATA/train.km-en.km | head -100000 > $DATA/abridged.km
cat $DATA/train.hi-en.hi | head -100000 > $DATA/abridged.hi
cat $DATA/train.fa-en.fa | head -100000 > $DATA/abridged.fa
cat $DATA/train.ps-en.en | head -100000 > $DATA/abridged.en
cat $DATA/train.km-en.en | head -100000 >> $DATA/abridged.en
cat $DATA/train.hi-en.en | head -100000 >> $DATA/abridged.en
cat $DATA/train.fa-en.en | head -100000 >> $DATA/abridged.en
echo "Concatenating target data"
cat $DATA/train.ps-en.en > $DATA/train-concatenated.en
cat $DATA/train.km-en.en >> $DATA/train-concatenated.en
cat $DATA/train.hi-en.en >> $DATA/train-concatenated.en
cat $DATA/train.fa-en.en >> $DATA/train-concatenated.en
cat $DATA/valid.ps-en.en > $DATA/valid-concatenated.en
cat $DATA/valid.km-en.en >> $DATA/valid-concatenated.en
cat $DATA/valid.hi-en.en >> $DATA/valid-concatenated.en
cat $DATA/valid.fa-en.en >> $DATA/valid-concatenated.en
cat $DATA/test.ps-en.en > $DATA/test-concatenated.en
cat $DATA/test.km-en.en >> $DATA/test-concatenated.en
cat $DATA/test.hi-en.en >> $DATA/test-concatenated.en
cat $DATA/test.fa-en.en >> $DATA/test-concatenated.en
echo "Learning BPE with sentencepiece"
python $SPM_TRAIN \
--input=$DATA/abridged.ps,$DATA/abridged.km,$DATA/abridged.hi,$DATA/abridged.fa,$DATA/abridged.en \
--model_prefix=$DATA/sentencepiece.bpe \
--vocab_size=$BPESIZE \
--character_coverage=1.0 \
--model_type=bpe
echo "Encoding"
for SOMESRC in $SRCS; do
python $SPM_ENCODE \
--model $DATA/sentencepiece.bpe.model \
--output_format=piece \
--inputs $DATA/train.$SOMESRC-en.$SOMESRC $DATA/train.$SOMESRC-en.en \
--outputs $DATA/train.bpe.$SOMESRC-en.$SOMESRC $DATA/train.bpe.$SOMESRC-en.en \
--min-len $TRAIN_MINLEN --max-len $TRAIN_MAXLEN
for SPLIT in "valid" "test"; do
python $SPM_ENCODE \
--model $DATA/sentencepiece.bpe.model \
--output_format=piece \
--inputs $DATA/$SPLIT.$SOMESRC-en.$SOMESRC $DATA/$SPLIT.$SOMESRC-en.en \
--outputs $DATA/$SPLIT.bpe.$SOMESRC-en.$SOMESRC $DATA/$SPLIT.bpe.$SOMESRC-en.en
done
done
echo "Converting sentencepiece vocabulary into fairseq dictionary"
tail -n +4 $DATA/sentencepiece.bpe.vocab | awk -F'\t' 'BEGIN{OFS=" "} {print $1, 100}' > $DATA/vocab
echo "Preprocessing"
for SOMESRC in $SRCS; do
echo "Binarizing ${SOMESRC}"
fairseq-preprocess \
--source-lang $SOMESRC --target-lang en \
--destdir $DATA \
--joined-dictionary \
--workers 4 \
--trainpref $DATA/train.bpe.$SOMESRC-en \
--validpref $DATA/valid.bpe.$SOMESRC-en \
--testpref $DATA/test.bpe.$SOMESRC-en \
--srcdict $DATA/vocab
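# Set the shared English dictionary aside after each language (restored below),
# presumably so the next fairseq-preprocess run does not balk at an existing
# dict.en.txt in the common --destdir.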
mv "${DATA}/dict.en.txt" "${DATA}/dict.en.txt-moved"
done
mv "${DATA}/dict.en.txt-moved" "${DATA}/dict.en.txt"