Extend and document the test for DPR with a non-BERT tokenizer
julian-risch committed Jun 2, 2021
1 parent 399f502 commit 4af4b3d
Showing 1 changed file with 163 additions and 13 deletions.
176 changes: 163 additions & 13 deletions test/test_dpr.py
@@ -3,6 +3,11 @@
import logging
import numpy as np
from pathlib import Path

from torch.utils.data import SequentialSampler
from tqdm import tqdm

from farm.data_handler.dataloader import NamedDataLoader
from farm.data_handler.processor import TextSimilarityProcessor
from farm.data_handler.data_silo import DataSilo
from farm.train import Trainer
@@ -445,6 +450,12 @@ def test_dpr_processor_save_load():


def test_dpr_processor_save_load_non_bert_tokenizer():
"""
This test compares 1) a model loaded from the model hub with
2) a model from the model hub that was saved to disk and then loaded from disk again, and
3) a FARM-style model that was saved to disk and then loaded from disk
"""

d = {'query': "Comment s'appelle le portail open data du gouvernement?",
'passages': [
{'title': 'Etalab',
@@ -457,7 +468,8 @@ def test_dpr_processor_save_load_non_bert_tokenizer():
# load model from model hub
query_embedding_model = "etalab-ia/dpr-question_encoder-fr_qa-camembert"
passage_embedding_model = "etalab-ia/dpr-ctx_encoder-fr_qa-camembert"
query_tokenizer = Tokenizer.load(
pretrained_model_name_or_path=query_embedding_model)  # tokenizer class is inferred automatically
query_encoder = LanguageModel.load(pretrained_model_name_or_path=query_embedding_model,
language_model_class="DPRQuestionEncoder")
passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_embedding_model)
@@ -490,7 +502,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer():
)
model.connect_heads_with_processor(processor.tasks, require_labels=False)

# save model that was loaded from model hub to disk
save_dir = "testsave/dpr_model"
query_encoder_dir = "query_encoder"
passage_encoder_dir = "passage_encoder"
@@ -500,23 +512,24 @@ def test_dpr_processor_save_load_non_bert_tokenizer():

# load model from disk
loaded_query_tokenizer = Tokenizer.load(
pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir,
use_fast=True)  # tokenizer class is inferred automatically
loaded_query_encoder = LanguageModel.load(pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir,
language_model_class="DPRQuestionEncoder")
language_model_class="DPRQuestionEncoder")
loaded_passage_tokenizer = Tokenizer.load(
pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, use_fast=True)
loaded_passage_encoder = LanguageModel.load(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir,
language_model_class="DPRContextEncoder")
language_model_class="DPRContextEncoder")

loaded_processor = TextSimilarityProcessor(query_tokenizer=loaded_query_tokenizer,
passage_tokenizer=loaded_passage_tokenizer,
max_seq_len_passage=256,
max_seq_len_query=256,
label_list=["hard_negative", "positive"],
metric="text_similarity_metric",
embed_title=True,
num_hard_negatives=0,
num_positives=1)
loaded_prediction_head = TextSimilarityHead(similarity_function="dot_product")

if torch.cuda.is_available():
Expand All @@ -539,6 +552,143 @@ def test_dpr_processor_save_load_non_bert_tokenizer():
dataset2, tensor_names2, _ = loaded_processor.dataset_from_dicts(dicts=[d], return_baskets=False)
assert np.array_equal(dataset.tensors[0], dataset2.tensors[0])
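# note: tensors[0] in the assert above presumably holds the query token ids,
# so equality means both processors tokenized the query identically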

# generate embeddings with model loaded from model hub
dataset, tensor_names, _, baskets = processor.dataset_from_dicts(
dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
)

data_loader = NamedDataLoader(
dataset=dataset, sampler=SequentialSampler(dataset), batch_size=16, tensor_names=tensor_names
)
all_embeddings = {"query": [], "passages": []}
model.eval()

for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)):
batch = {key: batch[key].to(device) for key in batch}

# get embeddings
with torch.no_grad():
query_embeddings, passage_embeddings = model.forward(**batch)[0]
if query_embeddings is not None:
all_embeddings["query"].append(query_embeddings.cpu().numpy())
if passage_embeddings is not None:
all_embeddings["passages"].append(passage_embeddings.cpu().numpy())

if all_embeddings["passages"]:
all_embeddings["passages"] = np.concatenate(all_embeddings["passages"])
if all_embeddings["query"]:
all_embeddings["query"] = np.concatenate(all_embeddings["query"])

# generate embeddings with model loaded from disk
dataset2, tensor_names2, _, baskets2 = loaded_processor.dataset_from_dicts(
dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
)

data_loader = NamedDataLoader(
dataset=dataset2, sampler=SequentialSampler(dataset2), batch_size=16, tensor_names=tensor_names2
)
all_embeddings2 = {"query": [], "passages": []}
loaded_model.eval()

for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)):
batch = {key: batch[key].to(device) for key in batch}

# get embeddings
with torch.no_grad():
query_embeddings, passage_embeddings = loaded_model.forward(**batch)[0]
if query_embeddings is not None:
all_embeddings2["query"].append(query_embeddings.cpu().numpy())
if passage_embeddings is not None:
all_embeddings2["passages"].append(passage_embeddings.cpu().numpy())

if all_embeddings2["passages"]:
all_embeddings2["passages"] = np.concatenate(all_embeddings2["passages"])
if all_embeddings2["query"]:
all_embeddings2["query"] = np.concatenate(all_embeddings2["query"])

# compare embeddings of model loaded from model hub and model loaded from disk
assert np.array_equal(all_embeddings["query"][0], all_embeddings2["query"][0])

# save the model that was loaded from disk to disk
save_dir = "testsave/dpr_model"
query_encoder_dir = "query_encoder"
passage_encoder_dir = "passage_encoder"
loaded_model.save(Path(save_dir), lm1_name=query_encoder_dir, lm2_name=passage_encoder_dir)
loaded_query_tokenizer.save_pretrained(save_dir + f"/{query_encoder_dir}")
loaded_passage_tokenizer.save_pretrained(save_dir + f"/{passage_encoder_dir}")
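# lm1_name / lm2_name presumably select the subdirectories for the two
# encoders, so this second save mirrors the directory layout created above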

# load model from disk
query_tokenizer = Tokenizer.load(
pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir) # tokenizer class is inferred automatically
query_encoder = LanguageModel.load(pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir,
language_model_class="DPRQuestionEncoder")
passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir)
passage_encoder = LanguageModel.load(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir,
language_model_class="DPRContextEncoder")

processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
passage_tokenizer=passage_tokenizer,
max_seq_len_passage=256,
max_seq_len_query=256,
label_list=["hard_negative", "positive"],
metric="text_similarity_metric",
embed_title=True,
num_hard_negatives=0,
num_positives=1)
prediction_head = TextSimilarityHead(similarity_function="dot_product")

if torch.cuda.is_available():
device = torch.device("cuda")
else:
device = torch.device("cpu")
model = BiAdaptiveModel(
language_model1=query_encoder,
language_model2=passage_encoder,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm1_output_types=["per_sequence"],
lm2_output_types=["per_sequence"],
device=device,
)
model.connect_heads_with_processor(processor.tasks, require_labels=False)

# compare a model loaded from disk that originated from the model hub and was then saved to disk with
# a model loaded from disk that originated from a FARM-style model that was saved to disk
dataset3, tensor_names3, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False)
dataset2, tensor_names2, _ = loaded_processor.dataset_from_dicts(dicts=[d], return_baskets=False)
assert np.array_equal(dataset3.tensors[0], dataset2.tensors[0])

# generate embeddings with the model loaded from disk that originated from the FARM-style model saved to disk earlier
dataset3, tensor_names3, _, baskets3 = processor.dataset_from_dicts(
dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
)

data_loader = NamedDataLoader(
dataset=dataset3, sampler=SequentialSampler(dataset3), batch_size=16, tensor_names=tensor_names3
)
all_embeddings3 = {"query": [], "passages": []}
model.eval()

for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)):
batch = {key: batch[key].to(device) for key in batch}

# get embeddings
with torch.no_grad():
query_embeddings, passage_embeddings = model.forward(**batch)[0]
if query_embeddings is not None:
all_embeddings3["query"].append(query_embeddings.cpu().numpy())
if passage_embeddings is not None:
all_embeddings3["passages"].append(passage_embeddings.cpu().numpy())

if all_embeddings3["passages"]:
all_embeddings3["passages"] = np.concatenate(all_embeddings3["passages"])
if all_embeddings3["query"]:
all_embeddings3["query"] = np.concatenate(all_embeddings3["query"])

# compare embeddings of the model loaded from the model hub with those of the model loaded from disk
# that originated from the FARM-style model saved to disk earlier
assert np.array_equal(all_embeddings["query"][0], all_embeddings3["query"][0])

def test_dpr_training():
batch_size = 1
n_epochs = 1