Hi,
In your paper, you say the "next-utterance classification" and "language modeling" tasks are trained in a multi-task learning setting, and train.py indeed contains a function that loads the persona dataset as described. However, there is a logical gap I cannot bridge: it looks as if the wrong candidates (i.e., the distractor replies) are also fed to the language modeling task. Can anyone explain this to me?
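(For reference, my mental model of the multi-task objective is a weighted sum of the two losses. Below is a minimal sketch of that reading; the coefficient names lm_coef and mc_coef and the function itself are my own illustration, not code from the repo.)

    import torch
    import torch.nn.functional as F

    def multitask_loss(lm_logits, lm_targets, mc_logits, mc_labels,
                       lm_coef=2.0, mc_coef=1.0, ignore_index=-100):
        # Language-modeling loss over the flattened token logits; positions
        # marked with ignore_index (e.g. padding, or tokens that should not
        # be predicted) contribute nothing to the loss.
        lm_loss = F.cross_entropy(lm_logits.view(-1, lm_logits.size(-1)),
                                  lm_targets.view(-1),
                                  ignore_index=ignore_index)
        # Next-utterance classification loss: pick the gold reply among
        # the num_candidates alternatives.
        mc_loss = F.cross_entropy(mc_logits, mc_labels)
        # Weighted sum of the two task losses.
        return lm_coef * lm_loss + mc_coef * mc_loss

Here is the data-loading function from train.py for reference: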
    # Excerpt from train.py. Helpers such as get_dataset, pad_dataset,
    # build_input_from_segments, SPECIAL_TOKENS and MODEL_INPUTS are defined
    # elsewhere in the repo.
    def get_data_loaders(args, tokenizer):
        """ Prepare the dataset for training and evaluation """
        personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

        logger.info("Build inputs and labels")
        datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
        for dataset_name, dataset in personachat.items():
            num_candidates = len(dataset[0]["utterances"][0]["candidates"])
            if args.num_candidates > 0 and dataset_name == 'train':
                num_candidates = min(args.num_candidates, num_candidates)
            for dialog in dataset:
                persona = dialog["personality"].copy()
                for _ in range(args.personality_permutations):
                    for utterance in dialog["utterances"]:
                        history = utterance["history"][-(2*args.max_history+1):]
                        for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                            # lm_labels is True only for the last candidate,
                            # which is the gold reply; all distractors get False
                            lm_labels = bool(j == num_candidates-1)
                            instance = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels)
                            for input_name, input_array in instance.items():
                                datasets[dataset_name][input_name].append(input_array)
                        # the gold reply is always the last candidate, so its
                        # index is the classification label
                        datasets[dataset_name]["mc_labels"].append(num_candidates - 1)
                        datasets[dataset_name]["n_candidates"] = num_candidates
                    persona = [persona[-1]] + persona[:-1]  # permuted personalities

        logger.info("Pad inputs and convert to Tensor")
        tensor_datasets = {"train": [], "valid": []}
        for dataset_name, dataset in datasets.items():
            dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
            for input_name in MODEL_INPUTS:
                tensor = torch.tensor(dataset[input_name])
                if input_name != "mc_labels":
                    # reshape to (batch, n_candidates, seq_len)
                    tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
                tensor_datasets[dataset_name].append(tensor)

        logger.info("Build train and validation dataloaders")
        train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
        valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
        train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, shuffle=(not args.distributed))
        valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False)
        logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
        logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
        return train_loader, valid_loader, train_sampler, valid_sampler
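For what it's worth, my reading of the line lm_labels = bool(j == num_candidates-1) is that only the last candidate (the gold reply) receives LM targets, so a distractor still passes through the model for the classification head but would contribute nothing to the LM loss, assuming build_input_from_segments masks its targets. A minimal sketch of that masking convention (build_lm_targets, candidate_span and IGNORE_INDEX are illustrative names, not the repo's code):

    import torch

    IGNORE_INDEX = -100  # assumption: the ignore index the LM loss skips

    def build_lm_targets(input_ids, candidate_span, lm_labels):
        # Start from a fully masked target sequence.
        targets = torch.full_like(input_ids, IGNORE_INDEX)
        if lm_labels:
            # Only the gold reply's tokens become LM targets; for distractor
            # candidates (lm_labels=False) everything stays masked.
            start, end = candidate_span
            targets[start:end] = input_ids[start:end]
        return targets

If that is what happens, the distractors are never really "trained on" by the LM head; is that the intended reading?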