# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import contextlib
import json
import logging
import os
import tempfile
import unittest
from io import StringIO

import torch

from fairseq import options
from fairseq_cli import train
from tests.utils import (
    create_dummy_data,
    generate_main,
    preprocess_lm_data,
    preprocess_translation_data,
    train_language_model,
    train_translation_model,
)
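
# GPU-only integration tests for fairseq's training binaries: multi-GPU
# training, checkpoint resume, mixed precision, FSDP, and quantization.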

@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestMultiGPU(unittest.TestCase):
    @staticmethod
    def parse_logs(logfile):
        logs = []
        # read via a context manager so the file handle is always closed
        with open(logfile, "r") as f:
            for ln in f:
                try:
                    logs.append(json.loads(ln))
                except json.JSONDecodeError:
                    continue
        return logs

    @property
    def world_size(self):
        return torch.cuda.device_count()
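
    # Flags shared by every LM training run in this class; `mu` caps --max-update.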
    def train_flags(self, mu):
        return [
            "--memory-efficient-fp16",
            "--update-freq",
            "1",
            "--seed",
            "1",
            "--log-format",
            "json",
            "--max-update",
            str(mu),
            "--tokens-per-sample",
            "20",
            "--batch-size",
            "2",
            "--share-decoder-input-output-embed",
            "--optimizer",
            "adam",
            "--max-valid-steps",
            "1",
            "--pad-to-fixed-length",
            "--sample-break-mode",
            "none",
        ]
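
    # Train for `mu` updates, saving a checkpoint every `save_interval` updates;
    # then resume from the mid-run checkpoint and require the resumed run to end
    # with exactly the same logged metrics as the from-scratch run.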
    def _test_resume_multilingual_training(
        self, extra_clargs, arch="transformer_lm_gpt2_tiny"
    ):
        languages = ["en_XX", "fr_XX", "zh_CN"]
        save_interval = 5
        mu = 10
        flags = (
            self.train_flags(mu)
            + ["--save-interval-updates", str(save_interval), "--log-interval", "1"]
            + extra_clargs
        )
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fp16") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(
                    data_dir,
                    num_examples=int(
                        mu * 20 * self.world_size * 1.5
                    ),  # make sure there is enough data for max updates
                    languages=languages,
                )
                preprocess_lm_data(data_dir, languages)
                train_language_model(
                    data_dir,
                    arch,
                    flags + ["--log-file", log],
                    task="multilingual_language_modeling",
                    world_size=self.world_size,
                )
                log2 = os.path.join(data_dir, "resume.log")
                ckpt_name = f"checkpoint_1_{save_interval}.pt"
                restore_file = os.path.join(data_dir, ckpt_name)
                train_language_model(
                    data_dir,
                    arch,
                    flags
                    + ["--log-file", log2, "--restore-file", restore_file, "--no-save"],
                    task="multilingual_language_modeling",
                    world_size=self.world_size,
                )

                l1 = self.parse_logs(log)
                assert (
                    int(l1[-1]["train_num_updates"]) == mu
                ), f"The first run did not complete {mu} updates. Add more data"
                l2 = self.parse_logs(log2)
                if int(l2[0]["num_updates"]) != save_interval + 1:
                    all_ckpt_files = [
                        x for x in os.listdir(data_dir) if x.endswith(".pt")
                    ]
                    import shutil

                    shutil.move(data_dir, "last_failed_resume")
                    raise AssertionError(
                        f"Likely failed to load {ckpt_name}. {all_ckpt_files} \n LOGS: {l1} \n\n {l2}. "
                    )
                for k in [
                    "train_loss",
                    "train_num_updates",
                    "train_ppl",
                    "train_gnorm",
                ]:
                    from_scratch, resumed = float(l1[-1][k]), float(l2[-1][k])
                    # This fails without rounding!
                    assert (
                        from_scratch == resumed
                    ), f"difference at {k} {from_scratch} != {resumed}"

@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestTranslationGPU(unittest.TestCase):
    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)
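
    # The three tests below run the same small translation job on up to two
    # GPUs, varying only the distributed setup: plain --fp16, and the slowmo
    # DDP backend with one or two processes per node.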
    def test_fp16_multigpu(self):
        self._test_multigpu("test_fp16", ["--fp16"])

    def test_slowmo_multigpu(self):
        self._test_multigpu(
            "test_slowmo", ["--ddp-backend", "slowmo", "--nprocs-per-node", "1"]
        )

    def test_slowmo_single_node_multigpu(self):
        self._test_multigpu(
            "test_slowmo_single_node",
            ["--ddp-backend", "slowmo", "--nprocs-per-node", "2"],
        )

    def _test_multigpu(self, test_name, test_args):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(test_name) as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    test_args + ["--log-file", log],
                    world_size=min(torch.cuda.device_count(), 2),
                )
                generate_main(data_dir)
                assert os.path.exists(log)

    @staticmethod
    def parse_logs(logfile):
        logs = []
        # read via a context manager so the file handle is always closed
        with open(logfile, "r") as f:
            for ln in f:
                try:
                    logs.append(json.loads(ln))
                except json.JSONDecodeError:
                    continue
        return logs

    def test_resume_training_fsdp(self):
        self._test_resume_training(["--ddp-backend", "fully_sharded"])

    def test_resume_training_fsdp_sharded_state(self):
        self._test_resume_training(
            ["--ddp-backend", "fully_sharded", "--use-sharded-state"]
        )

    def test_resume_training_noc10d(self):
        self._test_resume_training([])
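
    # Same resume check as in TestMultiGPU, but for a translation model: train
    # to update 10 with checkpoints every 2 updates, restart from
    # checkpoint_1_2.pt, and compare the final logged metrics of both runs.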
    def _test_resume_training(self, extra_clargs, arch="fconv_iwslt_de_en"):
        flags = [
            "--fp16",
            "--log-format",
            "json",
            "--max-update",
            "10",
            "--save-interval-updates",
            "2",
            "--log-interval",
            "1",
        ] + extra_clargs
        world_size = min(torch.cuda.device_count(), 2)
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fp16") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    arch,
                    flags + ["--log-file", log],
                    world_size=world_size,
                )
                log2 = os.path.join(data_dir, "resume.log")
                restore_file = os.path.join(data_dir, "checkpoint_1_2.pt")
                train_translation_model(
                    data_dir,
                    arch,
                    flags + ["--log-file", log2, "--restore-file", restore_file],
                    world_size=world_size,
                )

                l1 = self.parse_logs(log)
                l2 = self.parse_logs(log2)
                assert int(l2[0]["num_updates"]) == 3, f"{l1}\n\n {l2}"
                for k in [
                    "train_loss",
                    "train_num_updates",
                    "train_ppl",
                    "train_gnorm",
                ]:
                    from_scratch, resumed = l1[-1][k], l2[-1][k]
                    assert (
                        from_scratch == resumed
                    ), f"difference at {k} {from_scratch} != {resumed}"

    def test_memory_efficient_fp16(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_memory_efficient_fp16") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir, "fconv_iwslt_de_en", ["--memory-efficient-fp16"]
                )
                generate_main(data_dir)

    def test_transformer_fp16(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_transformer") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "transformer_iwslt_de_en",
                    [
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "64",
                        "--decoder-embed-dim",
                        "64",
                        "--fp16",
                    ],
                    run_validation=True,
                )
                generate_main(data_dir)

    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
    def test_amp(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_amp") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(data_dir, "fconv_iwslt_de_en", ["--amp"])
                generate_main(data_dir)

    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
    def test_transformer_amp(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_transformer") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "transformer_iwslt_de_en",
                    [
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "64",
                        "--decoder-embed-dim",
                        "64",
                        "--amp",
                    ],
                    run_validation=True,
                )
                generate_main(data_dir)

    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
    def test_levenshtein_transformer(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_levenshtein_transformer"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, ["--joined-dictionary"])
                train_translation_model(
                    data_dir,
                    "levenshtein_transformer",
                    [
                        "--apply-bert-init",
                        "--early-exit",
                        "6,6,6",
                        "--criterion",
                        "nat_loss",
                    ],
                    task="translation_lev",
                )
                gen_config = [
                    "--task",
                    "translation_lev",
                    "--iter-decode-max-iter",
                    "9",
                    "--iter-decode-eos-penalty",
                    "0",
                    "--print-step",
                ]
                # non-ensemble generation
                generate_main(data_dir, gen_config)
                # ensemble generation
                generate_main(
                    data_dir,
                    gen_config,
                    path=os.pathsep.join(
                        [
                            os.path.join(data_dir, "checkpoint_last.pt"),
                            os.path.join(data_dir, "checkpoint_last.pt"),
                        ]
                    ),
                )
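
    # FSDP checkpoint tests: train with --ddp-backend fully_sharded (with a
    # consolidated or a sharded checkpoint state) and make sure generation can
    # load the resulting checkpoint.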
    def test_fsdp_checkpoint_generate(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fsdp_sharded") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                world_size = min(torch.cuda.device_count(), 2)
                train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    ["--log-file", log, "--ddp-backend", "fully_sharded"],
                    world_size=world_size,
                )
                generate_main(data_dir)
                assert os.path.exists(log)

    def test_fsdp_sharded_checkpoint_generate(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fsdp_sharded") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                world_size = min(torch.cuda.device_count(), 2)
                train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    [
                        "--log-file",
                        log,
                        "--ddp-backend",
                        "fully_sharded",
                        "--use-sharded-state",
                    ],
                    world_size=world_size,
                )
                generate_main(data_dir, ["--checkpoint-shard-count", str(world_size)])
                assert os.path.exists(log)
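

# Module-level helper exercising three quantization paths in sequence: a dense
# baseline run, training with scalar quantization noise (--quant-noise-scalar),
# and iterative product quantization driven by a config file, resuming from the
# baseline checkpoint.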
def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
        ]
        + (extra_flags or []),
    )
    train.main(train_args)

    # try scalar quantization
    scalar_quant_train_parser = options.get_training_parser()
    scalar_quant_train_args = options.parse_args_and_arch(
        scalar_quant_train_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-update",
            "3",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
            "--quant-noise-scalar",
            "0.5",
        ]
        + (extra_flags or []),
    )
    train.main(scalar_quant_train_args)

    # try iterative PQ quantization
    quantize_parser = options.get_training_parser()
    quantize_args = options.parse_args_and_arch(
        quantize_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "50",
            "--tokens-per-sample",
            "50",
            "--max-update",
            "6",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
            "--restore-file",
            os.path.join(data_dir, "checkpoint_last.pt"),
            "--reset-optimizer",
            "--quantization-config-path",
            os.path.join(
                os.path.dirname(__file__), "transformer_quantization_config.yaml"
            ),
        ]
        + (extra_flags or []),
    )
    train.main(quantize_args)


@unittest.skipIf(
    # compare (major, minor) version tuples; indexing a single character of
    # torch.__version__ breaks for multi-digit version components
    tuple(int(v) for v in torch.__version__.split(".")[:2]) < (1, 10),
    reason="quantized kernels are only supported on CPU",
)
@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestQuantization(unittest.TestCase):
    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_quantization(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_quantization") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                # tests both scalar and iterative PQ quantization
                _quantize_language_model(data_dir, "transformer_lm")


@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestOptimizersGPU(unittest.TestCase):
    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)
    def test_flat_grads(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_flat_grads") as data_dir:
                # use just a bit of data and a tiny model to keep the runtime reasonable
                create_dummy_data(data_dir, num_examples=10, maxlen=5)
                preprocess_translation_data(data_dir)
                with self.assertRaises(RuntimeError):
                    # adafactor isn't compatible with flat grads, which
                    # are used by default with --fp16
                    train_translation_model(
                        data_dir,
                        "lstm",
                        [
                            "--required-batch-size-multiple",
                            "1",
                            "--encoder-layers",
                            "1",
                            "--encoder-hidden-size",
                            "32",
                            "--decoder-layers",
                            "1",
                            "--optimizer",
                            "adafactor",
                            "--fp16",
                        ],
                    )
                # but it should pass once we set --fp16-no-flatten-grads
                train_translation_model(
                    data_dir,
                    "lstm",
                    [
                        "--required-batch-size-multiple",
                        "1",
                        "--encoder-layers",
                        "1",
                        "--encoder-hidden-size",
                        "32",
                        "--decoder-layers",
                        "1",
                        "--optimizer",
                        "adafactor",
                        "--fp16",
                        "--fp16-no-flatten-grads",
                    ],
                )


if __name__ == "__main__":
    unittest.main()