split_train_valid_docs.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. #!/usr/bin/env python3
  2. # Copyright (c) Facebook, Inc. and its affiliates.
  3. #
  4. # This source code is licensed under the MIT license found in the
  5. # LICENSE file in the root directory of this source tree.
  6. """
  7. Split a large file into a train and valid set while respecting document
  8. boundaries. Documents should be separated by a single empty line.
  9. """
  10. import argparse
  11. import random
  12. import sys
  13. def main():
  14. parser = argparse.ArgumentParser()
  15. parser.add_argument("input")
  16. parser.add_argument("sample_output", help="train output file")
  17. parser.add_argument("remainder_output", help="valid output file")
  18. parser.add_argument("-k", type=int, help="remainder size")
  19. parser.add_argument(
  20. "--lines", action="store_true", help="split lines instead of docs"
  21. )
  22. args = parser.parse_args()
  23. assert args.k is not None
  24. sample = []
  25. remainder = []
  26. num_docs = [0]
  27. def update_sample(doc):
  28. if len(sample) < args.k:
  29. sample.append(doc.copy())
  30. else:
  31. i = num_docs[0]
  32. j = random.randrange(i + 1)
  33. if j < args.k:
  34. remainder.append(sample[j])
  35. sample[j] = doc.copy()
  36. else:
  37. remainder.append(doc.copy())
  38. num_docs[0] += 1
  39. doc.clear()
  40. with open(args.input, "r", encoding="utf-8") as h:
  41. doc = []
  42. for i, line in enumerate(h):
  43. if line.strip() == "": # empty line indicates new document
  44. update_sample(doc)
  45. else:
  46. doc.append(line)
  47. if args.lines:
  48. update_sample(doc)
  49. if i % 1000000 == 0:
  50. print(i, file=sys.stderr, end="", flush=True)
  51. elif i % 100000 == 0:
  52. print(".", file=sys.stderr, end="", flush=True)
  53. if len(doc) > 0:
  54. update_sample(doc)
  55. print(file=sys.stderr, flush=True)
  56. assert len(sample) == args.k
  57. with open(args.sample_output, "w", encoding="utf-8") as out:
  58. first = True
  59. for doc in sample:
  60. if not first and not args.lines:
  61. out.write("\n")
  62. first = False
  63. for line in doc:
  64. out.write(line)
  65. with open(args.remainder_output, "w", encoding="utf-8") as out:
  66. first = True
  67. for doc in remainder:
  68. if not first and not args.lines:
  69. out.write("\n")
  70. first = False
  71. for line in doc:
  72. out.write(line)
  73. if __name__ == "__main__":
  74. main()