build_sym_alignment.py 3.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. # Copyright (c) Facebook, Inc. and its affiliates.
  2. #
  3. # This source code is licensed under the MIT license found in the
  4. # LICENSE file in the root directory of this source tree.
  5. """
  6. Use this script in order to build symmetric alignments for your translation
  7. dataset.
  8. This script depends on fast_align and mosesdecoder tools. You will need to
  9. build those before running the script.
  10. fast_align:
  11. github: http://github.com/clab/fast_align
  12. instructions: follow the instructions in README.md
  13. mosesdecoder:
  14. github: http://github.com/moses-smt/mosesdecoder
  15. instructions: http://www.statmt.org/moses/?n=Development.GetStarted
  16. The script produces the following files under --output_dir:
  17. text.joined - concatenation of lines from the source_file and the
  18. target_file.
  19. align.forward - forward pass of fast_align.
  20. align.backward - backward pass of fast_align.
  21. aligned.sym_heuristic - symmetrized alignment.
  22. """
  23. import argparse
  24. import os
  25. from itertools import zip_longest
  26. def main():
  27. parser = argparse.ArgumentParser(description="symmetric alignment builer")
  28. # fmt: off
  29. parser.add_argument('--fast_align_dir',
  30. help='path to fast_align build directory')
  31. parser.add_argument('--mosesdecoder_dir',
  32. help='path to mosesdecoder root directory')
  33. parser.add_argument('--sym_heuristic',
  34. help='heuristic to use for symmetrization',
  35. default='grow-diag-final-and')
  36. parser.add_argument('--source_file',
  37. help='path to a file with sentences '
  38. 'in the source language')
  39. parser.add_argument('--target_file',
  40. help='path to a file with sentences '
  41. 'in the target language')
  42. parser.add_argument('--output_dir',
  43. help='output directory')
  44. # fmt: on
  45. args = parser.parse_args()
  46. fast_align_bin = os.path.join(args.fast_align_dir, "fast_align")
  47. symal_bin = os.path.join(args.mosesdecoder_dir, "bin", "symal")
  48. sym_fast_align_bin = os.path.join(
  49. args.mosesdecoder_dir, "scripts", "ems", "support", "symmetrize-fast-align.perl"
  50. )
  51. # create joined file
  52. joined_file = os.path.join(args.output_dir, "text.joined")
  53. with open(args.source_file, "r", encoding="utf-8") as src, open(
  54. args.target_file, "r", encoding="utf-8"
  55. ) as tgt:
  56. with open(joined_file, "w", encoding="utf-8") as joined:
  57. for s, t in zip_longest(src, tgt):
  58. print("{} ||| {}".format(s.strip(), t.strip()), file=joined)
  59. bwd_align_file = os.path.join(args.output_dir, "align.backward")
  60. # run forward alignment
  61. fwd_align_file = os.path.join(args.output_dir, "align.forward")
  62. fwd_fast_align_cmd = "{FASTALIGN} -i {JOINED} -d -o -v > {FWD}".format(
  63. FASTALIGN=fast_align_bin, JOINED=joined_file, FWD=fwd_align_file
  64. )
  65. assert os.system(fwd_fast_align_cmd) == 0
  66. # run backward alignment
  67. bwd_align_file = os.path.join(args.output_dir, "align.backward")
  68. bwd_fast_align_cmd = "{FASTALIGN} -i {JOINED} -d -o -v -r > {BWD}".format(
  69. FASTALIGN=fast_align_bin, JOINED=joined_file, BWD=bwd_align_file
  70. )
  71. assert os.system(bwd_fast_align_cmd) == 0
  72. # run symmetrization
  73. sym_out_file = os.path.join(args.output_dir, "aligned")
  74. sym_cmd = "{SYMFASTALIGN} {FWD} {BWD} {SRC} {TGT} {OUT} {HEURISTIC} {SYMAL}".format(
  75. SYMFASTALIGN=sym_fast_align_bin,
  76. FWD=fwd_align_file,
  77. BWD=bwd_align_file,
  78. SRC=args.source_file,
  79. TGT=args.target_file,
  80. OUT=sym_out_file,
  81. HEURISTIC=args.sym_heuristic,
  82. SYMAL=symal_bin,
  83. )
  84. assert os.system(sym_cmd) == 0
  85. if __name__ == "__main__":
  86. main()