12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- # Copyright (c) Facebook, Inc. and its affiliates.
- #
- # This source code is licensed under the MIT license found in the
- # LICENSE file in the root directory of this source tree.
"""
Use this script in order to build symmetric alignments for your translation
dataset.

This script depends on fast_align and mosesdecoder tools. You will need to
build those before running the script.

fast_align:
    github: http://github.com/clab/fast_align
    instructions: follow the instructions in README.md
mosesdecoder:
    github: http://github.com/moses-smt/mosesdecoder
    instructions: http://www.statmt.org/moses/?n=Development.GetStarted

The script produces the following files under --output_dir:
    text.joined - concatenation of lines from the source_file and the
        target_file, one "source ||| target" pair per line.
    align.forward - forward pass of fast_align.
    align.backward - backward pass of fast_align.
    aligned.sym_heuristic - symmetrized alignment.
"""
- import argparse
- import os
- from itertools import zip_longest
def main():
    """Build forward, backward and symmetrized alignments for a parallel corpus.

    Command-line options are read from ``sys.argv``. The function:

    1. writes ``text.joined`` under ``--output_dir``, one
       ``"<source> ||| <target>"`` line per sentence pair;
    2. runs ``fast_align`` on it twice (the second time with ``-r``) and
       stores the outputs in ``align.forward`` and ``align.backward``;
    3. calls mosesdecoder's ``symmetrize-fast-align.perl`` with both
       alignment files, the original source/target files, the output prefix
       ``<output_dir>/aligned``, the chosen heuristic and the ``symal`` binary.

    Raises:
        ValueError: if the source and target files have different numbers
            of lines.
        subprocess.CalledProcessError: if any external tool exits non-zero.
        SystemExit: raised by argparse when a required option is missing.
    """
    # Function-scope import so this block does not depend on edits to the
    # file-level import section.
    import subprocess

    parser = argparse.ArgumentParser(description="symmetric alignment builer")
    # Every path option is required: a missing one used to reach
    # os.path.join(None, ...) and die with an opaque TypeError.
    # fmt: off
    parser.add_argument('--fast_align_dir', required=True,
                        help='path to fast_align build directory')
    parser.add_argument('--mosesdecoder_dir', required=True,
                        help='path to mosesdecoder root directory')
    parser.add_argument('--sym_heuristic',
                        help='heuristic to use for symmetrization',
                        default='grow-diag-final-and')
    parser.add_argument('--source_file', required=True,
                        help='path to a file with sentences '
                             'in the source language')
    parser.add_argument('--target_file', required=True,
                        help='path to a file with sentences '
                             'in the target language')
    parser.add_argument('--output_dir', required=True,
                        help='output directory')
    # fmt: on
    args = parser.parse_args()

    fast_align_bin = os.path.join(args.fast_align_dir, "fast_align")
    symal_bin = os.path.join(args.mosesdecoder_dir, "bin", "symal")
    sym_fast_align_bin = os.path.join(
        args.mosesdecoder_dir, "scripts", "ems", "support", "symmetrize-fast-align.perl"
    )

    # Create the output directory up front so the first open() cannot fail
    # merely because it does not exist yet.
    os.makedirs(args.output_dir, exist_ok=True)

    # create joined file
    joined_file = os.path.join(args.output_dir, "text.joined")
    with open(args.source_file, "r", encoding="utf-8") as src, open(
        args.target_file, "r", encoding="utf-8"
    ) as tgt, open(joined_file, "w", encoding="utf-8") as joined:
        # zip_longest pads the shorter file with None, which lets us detect a
        # length mismatch instead of silently truncating the corpus.
        for line_no, (s, t) in enumerate(zip_longest(src, tgt), start=1):
            if s is None or t is None:
                shorter = args.source_file if s is None else args.target_file
                raise ValueError(
                    "source and target files must have the same number of "
                    "lines: {} has no line {}".format(shorter, line_no)
                )
            print("{} ||| {}".format(s.strip(), t.strip()), file=joined)

    # The external tools are started with argument lists and check=True:
    #   * `assert os.system(...) == 0` is removed entirely under `python -O`,
    #     which would skip the commands themselves;
    #   * argument lists are immune to spaces/shell metacharacters in paths.

    # run forward alignment
    fwd_align_file = os.path.join(args.output_dir, "align.forward")
    with open(fwd_align_file, "wb") as fwd_out:
        subprocess.run(
            [fast_align_bin, "-i", joined_file, "-d", "-o", "-v"],
            stdout=fwd_out,
            check=True,
        )

    # run backward alignment
    bwd_align_file = os.path.join(args.output_dir, "align.backward")
    with open(bwd_align_file, "wb") as bwd_out:
        subprocess.run(
            [fast_align_bin, "-i", joined_file, "-d", "-o", "-v", "-r"],
            stdout=bwd_out,
            check=True,
        )

    # run symmetrization
    sym_out_file = os.path.join(args.output_dir, "aligned")
    subprocess.run(
        [
            sym_fast_align_bin,
            fwd_align_file,
            bwd_align_file,
            args.source_file,
            args.target_file,
            sym_out_file,
            args.sym_heuristic,
            symal_bin,
        ],
        check=True,
    )
# Run the alignment pipeline only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
|