# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import functools
import unittest
from typing import Any, Dict, Optional, Sequence

import fairseq
import fairseq.options
import fairseq.tasks
import torch

from tests.utils import dummy_dictionary

VOCAB_SIZE = 100


@fairseq.tasks.register_task("fake_task")
class FakeTask(fairseq.tasks.LegacyFairseqTask):
    def __init__(self, args):
        super().__init__(args)
        self.dictionary = dummy_dictionary(VOCAB_SIZE - 4)
        assert len(self.dictionary) == VOCAB_SIZE

    @property
    def source_dictionary(self):
        return self.dictionary

    @property
    def target_dictionary(self):
        return self.dictionary


@functools.lru_cache()
def get_toy_model(
    device: str,
    architecture: str = "roberta_enc_dec",
    **extra_args: Any,
):
    assert device in ("gpu", "cpu")
    kwargs = {
        "arch": architecture,
        # Use small, distinctive dimensions so that shape mismatches surface quickly.
        "encoder_layers": 3,
        "encoder_embed_dim": 12,
        "encoder_ffn_embed_dim": 14,
        "encoder_attention_heads": 4,
        "decoder_layers": 3,
        "decoder_embed_dim": 12,
        "decoder_ffn_embed_dim": 14,
        "decoder_attention_heads": 4,
        # Disable dropout so that test runs are deterministic and comparable.
        "dropout": 0,
        "attention_dropout": 0,
        "activation_dropout": 0,
        "encoder_layerdrop": 0,
        # Required args.
        "tokens_per_sample": 256,
        "data": "/tmp/test_roberta",
    }
    kwargs.update(extra_args)
    fake_task = FakeTask(kwargs)
    args = fairseq.options.get_args(
        task="online_backtranslation",
        mono_langs="en,ro",
        valid_lang_pairs="en-ro",
        **kwargs,
    )
    torch.manual_seed(0)
    model = fake_task.build_model(args)
    if device == "gpu":
        model.cuda()
    return fake_task, model
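

# NOTE: `get_toy_model` is memoized via `functools.lru_cache`, so calls with
# identical arguments return the *same* task and model objects rather than
# fresh copies. A usage sketch (all names defined above):
#
#   task, model = get_toy_model("cpu", architecture="roberta")
#   task2, model2 = get_toy_model("cpu", architecture="roberta")
#   assert model is model2  # cache hit: in-place mutations (e.g. pruning) persist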


def mk_sample(
    lang: str, device: str, tok: Optional[Sequence[int]] = None, batch_size: int = 2
) -> Dict[str, Any]:
    assert device in ("gpu", "cpu")
    if not tok:
        if lang == "en":
            tok = [10, 11, 12, 13, 14, 15, 2]
        else:
            tok = [20, 21, 22, 23, 24, 25, 26, 27, 2]
    batch = torch.stack([torch.tensor(tok, dtype=torch.long)] * batch_size)
    if device == "gpu":
        batch = batch.cuda()
    sample = {
        "net_input": {
            "src_tokens": batch,
            "prev_output_tokens": batch,
            "src_lengths": torch.tensor(
                [len(tok)] * batch_size, dtype=torch.long, device=batch.device
            ),
        },
        "target": batch[:, 1:],
    }
    return sample
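

# The dummy dictionary reserves the first four ids for special symbols; id 2 is
# fairseq's default EOS, which is why both toy sentences end in 2.
# `prev_output_tokens` simply reuses the source batch (sufficient for the shape
# and determinism checks below), and `target` drops the first token so that it
# lines up with next-token predictions.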


def cpu_gpu(fn):
    # Run the wrapped test on CPU and, if CUDA is available, again on GPU.
    @functools.wraps(fn)
    def helper(self):
        fn(self, "cpu")
        if torch.cuda.is_available():
            fn(self, "gpu")

    return helper


def architectures(fn):
    # Parametrize the wrapped test over both toy architectures.
    @functools.wraps(fn)
    def helper(self):
        for arch in ["roberta_enc_dec", "transformer"]:
            fn(self, arch)

    return helper
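

# Usage sketch for the decorators above: applying `@cpu_gpu` to a method like
#
#   @cpu_gpu
#   def test_something(self, device: str): ...
#
# produces a zero-argument test that unittest discovers normally and that runs
# once on CPU and once more on GPU when CUDA is available. `architectures` is
# defined for the same style of parametrization but is not applied to any test
# in this module.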


class RobertaTest(unittest.TestCase):
    def assertTensorEqual(self, t1, t2, delta: float = 1e-6):
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        if delta == 0.0:
            self.assertEqual(t1.ne(t2).long().sum(), 0)
        else:
            self.assertEqual(((t2 - t1).abs() > delta).long().sum(), 0)

    def assertSharing(self, model, link_groups: Sequence[Sequence[str]]):
        ids = {}
        for group in link_groups:
            group_ids = {name: id(params(model, name)) for name in group}
            shared_id = group_ids[group[0]]
            self.assertEqual(group_ids, {name: shared_id for name in group})
            self.assertNotIn(shared_id, ids)
            ids[shared_id] = group
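
    # Each inner list of `link_groups` names parameters that must alias the
    # very same tensor (compared by `id`), while parameters from different
    # groups must not alias each other.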

    def test_roberta_shared_params(self):
        _, roberta = get_toy_model("cpu", architecture="roberta")
        self.assertSharing(
            roberta,
            [
                [
                    "encoder.sentence_encoder.embed_tokens.weight",
                    "encoder.lm_head.weight",
                ]
            ],
        )

        _, roberta = get_toy_model(
            "cpu", architecture="roberta", untie_weights_roberta=True
        )
        self.assertSharing(
            roberta,
            [
                ["encoder.sentence_encoder.embed_tokens.weight"],
                ["encoder.lm_head.weight"],
            ],
        )

    def test_roberta_enc_dec_shared_params(self):
        # 3 distinct embeddings
        _, enc_dec = get_toy_model("cpu", architecture="roberta_enc_dec")
        self.assertSharing(
            enc_dec,
            [
                ["encoder.embed_tokens.weight"],
                ["decoder.embed_tokens.weight"],
                ["decoder.output_projection.weight"],
            ],
        )

        # 2 distinct embeddings, one for encoder, one for decoder
        _, enc_dec = get_toy_model(
            "cpu", architecture="roberta_enc_dec", share_decoder_input_output_embed=True
        )
        self.assertSharing(
            enc_dec,
            [
                ["encoder.embed_tokens.weight"],
                [
                    "decoder.embed_tokens.weight",
                    "decoder.output_projection.weight",
                ],
            ],
        )

        # shared embeddings
        _, enc_dec = get_toy_model(
            "cpu", architecture="roberta_enc_dec", share_all_embeddings=True
        )
        self.assertSharing(
            enc_dec,
            [
                [
                    "encoder.embed_tokens.weight",
                    "decoder.embed_tokens.weight",
                    "decoder.output_projection.weight",
                ]
            ],
        )

    def test_roberta_max_positions_is_correctly_set(self):
        device = "cpu"
        task, model = get_toy_model(device)
        max_pos = model.max_decoder_positions()

        self.assertEqual(max_pos, 256)
        self.assertEqual(max_pos, model.decoder.max_positions())
        self.assertEqual(max_pos, model.encoder.max_positions())
        self.assertEqual(max_pos, model.encoder.embed_positions.max_positions)

        sentence = [31 for _ in range(max_pos)]
        sample = mk_sample("en", device, sentence, batch_size=1)
        self.assertEqual(list(sample["net_input"]["src_lengths"]), [max_pos])
        self.assertEqual(len(sample["net_input"]["src_tokens"][0]), max_pos)
        x, _ = model.forward(**sample["net_input"])
        self.assertEqual(x.shape, (1, max_pos, VOCAB_SIZE))
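
    # Feeding a sentence of exactly `max_pos` tokens exercises the boundary
    # case: if `max_decoder_positions()` over-reported the capacity of the
    # positional embedding table, the forward pass at exactly that length
    # would fail with an out-of-range lookup.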

    @cpu_gpu
    def test_roberta_forward_backward(self, device: str):
        _, model = get_toy_model(device)
        sample = mk_sample("en", device)
        en_tokens = sample["net_input"]["src_tokens"]
        (bs, seq_len) = en_tokens.shape
        # Forward
        logits, _ = model(**sample["net_input"])
        self.assertEqual(logits.shape, (bs, seq_len, VOCAB_SIZE))
        # Backward
        loss = logits.sum()
        loss.backward()

    @cpu_gpu
    def test_roberta_forward_backward_bs1(self, device: str):
        _, model = get_toy_model(device)
        sample = mk_sample("en", device, batch_size=1)
        o, _ = model.forward(**sample["net_input"])
        loss = o.sum()
        sample2 = mk_sample("ro", device, batch_size=1)
        o, _ = model.forward(**sample2["net_input"])
        loss += o.sum()
        loss.backward()

    @cpu_gpu
    def test_roberta_batching(self, device: str):
        """
        Checks that a batch of size 2 (the same sentence twice) gives, twice,
        the results of the batch of size 1.
        """
        _, model = get_toy_model(device)
        sample = mk_sample("en", device, batch_size=1)
        slen = sample["net_input"]["src_lengths"][0]
        sample2 = mk_sample("en", device, batch_size=2)
        with torch.no_grad():
            z = model.encoder.forward(
                sample["net_input"]["src_tokens"], sample["net_input"]["src_lengths"]
            )
            z = z["encoder_out"][-1]
            logits, _ = model.forward(**sample["net_input"])

            z2 = model.encoder.forward(
                sample2["net_input"]["src_tokens"], sample2["net_input"]["src_lengths"]
            )
            z2 = z2["encoder_out"][-1]
            logits2, _ = model.forward(**sample2["net_input"])

        self.assertEqual(z.shape, (slen, 1, 12))
        self.assertEqual(z2.shape, (slen, 2, 12))
        self.assertTensorEqual(logits2[0], logits2[1])
        self.assertTensorEqual(logits[0], logits2[0])
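
    # fairseq encoders return time-major tensors: each entry of `encoder_out`
    # is laid out as (seq_len, batch, embed_dim), which is why the shapes above
    # are (slen, 1, 12) and (slen, 2, 12), with 12 matching encoder_embed_dim.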

    @cpu_gpu
    def test_roberta_incremental_decoder(self, device: str):
        """
        Checks that incremental decoding yields the same results as
        non-incremental decoding.
        """
        task, model = get_toy_model(device)

        en_sample = mk_sample("en", device)
        en_tokens = en_sample["net_input"]["src_tokens"]
        ro_sample = mk_sample("ro", device)
        ro_tokens = ro_sample["net_input"]["src_tokens"]

        en_enc = model.encoder.forward(
            en_tokens, src_lengths=en_sample["net_input"]["src_lengths"]
        )
        (bs, tgt_len) = ro_tokens.shape

        # Decode without incremental state
        ro_dec, _ = model.decoder.forward(ro_tokens, encoder_out=en_enc)
        self.assertEqual(ro_dec.shape, (bs, tgt_len, VOCAB_SIZE))
        self.assertTensorEqual(ro_dec[0], ro_dec[1])

        # Decode with incremental state
        inc_state = {}
        ro_dec_inc = []
        for i in range(tgt_len):
            ro, _ = model.decoder.forward(
                ro_tokens[:, : i + 1], encoder_out=en_enc, incremental_state=inc_state
            )
            self.assertEqual(ro.shape, (bs, 1, VOCAB_SIZE))
            ro_dec_inc.append(ro)

        for i in range(tgt_len):
            # Intra-batch
            self.assertTensorEqual(ro_dec_inc[i][0], ro_dec_inc[i][1])
            # Incremental vs non-incremental
            self.assertTensorEqual(ro_dec_inc[i][:, 0], ro_dec[:, i])
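
    # With `incremental_state` passed in, the decoder caches attention keys and
    # values and emits logits only for the newest position, hence the
    # (bs, 1, VOCAB_SIZE) shape per step; stacking the per-step outputs must
    # reproduce the full-sequence decode above.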

    @cpu_gpu
    def test_regularize_for_adaprune_in_roberta(self, device: str):
        _, model = get_toy_model(
            device=device,
            architecture="roberta_base",
            mha_reg_scale_factor=0.000375,
            ffn_reg_scale_factor=0.000375,
        )
        sample = mk_sample("en", device, batch_size=1)
        task_loss, _ = model.forward(**sample["net_input"])
        head_loss = model._get_adaptive_head_loss()
        ffn_loss = model._get_adaptive_ffn_loss()
        loss = task_loss.sum() + head_loss + ffn_loss
        loss.backward()

    @cpu_gpu
    def test_ffn_prune_for_adaprune_in_roberta(self, device: str):
        _, model = get_toy_model(
            device=device,
            architecture="roberta_base",
        )
        sample = mk_sample("en", device, batch_size=1)
        for layer in model.encoder.sentence_encoder.layers:
            fc1_original_size = layer.fc1.out_features
            remove_index = layer._get_fc_rank(remove_num=2)
            layer._prune_fc_layer(remove_index=remove_index)
            self.assertEqual(layer.fc1.out_features, fc1_original_size - 2)
        task_loss, _ = model.forward(**sample["net_input"])


def params(model, name):
    """Resolve a dotted parameter path on a module, e.g. "decoder.embed_tokens.weight"."""
    if "." not in name:
        return getattr(model, name)
    prefix, name = name.split(".", 1)
    return params(getattr(model, prefix), name)
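

# A usage sketch for `params` (an `operator.attrgetter`-style lookup):
#
#   weight = params(model, "encoder.lm_head.weight")
#
# The standard unittest entry point below lets the module be run directly;
# test runners such as pytest will also discover the TestCase without it.

if __name__ == "__main__":
    unittest.main()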
|