from vespa.application import Vespa
from learntorank.query import QueryModel, Ranking, OR
# Connect to the Vespa application served at the cord19 API URL.
app = Vespa(url="https://api.cord19.vespa.ai")

# Define the query model used when collecting data: an OR match phase
# ranked with the "bm25" rank profile.
# NOTE(review): list_features=True presumably asks Vespa to return the rank
# features with each hit -- confirm against the learntorank documentation.
query_model = QueryModel(
    match_phase=OR(),
    ranking=Ranking(name="bm25", list_features=True),
)
Collect training data from application
Collect training data to analyse and/or improve ranking functions
Example setup
Connect to the application and define a query model.
Define some labeled data.
# Labeled data: one entry per query, holding the query id, the query text and
# the list of documents judged relevant for it (document id plus score).
labeled_data = [
    {
        "query_id": 0,
        "query": "Intrauterine virus infections and congenital heart disease",
        "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
    },
    {
        "query_id": 1,
        "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
        "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}],
    },
]
Collect training data in batch
from learntorank.query import collect_vespa_features
# Collect training data for every labeled query in a single call.
# NOTE(review): number_additional_docs=2 presumably adds two non-labeled
# matched documents per query next to the labeled relevant ones -- confirm
# against the collect_vespa_features documentation.
training_data_batch = collect_vespa_features(
    app=app,
    labeled_data=labeled_data,
    id_field="id",
    query_model=query_model,
    number_additional_docs=2,
    fields=["rankfeatures"],
)
# Bare expression so a notebook cell displays the collected data.
training_data_batch
| | document_id | query_id | label | attributeMatch(authors.first) | attributeMatch(authors.first).averageWeight | attributeMatch(authors.first).completeness | attributeMatch(authors.first).fieldCompleteness | attributeMatch(authors.first).importance | attributeMatch(authors.first).matches | attributeMatch(authors.first).maxWeight | ... | textSimilarity(results).fieldCoverage | textSimilarity(results).order | textSimilarity(results).proximity | textSimilarity(results).queryCoverage | textSimilarity(results).score | textSimilarity(title).fieldCoverage | textSimilarity(title).order | textSimilarity(title).proximity | textSimilarity(title).queryCoverage | textSimilarity(title).score |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.062500 | 0.0 | 0.0000 | 0.142857 | 0.055357 |
1 | 3 | 0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.142857 | 0.0 | 0.4375 | 0.142857 | 0.224554 |
4 | 255164 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | 1.0000 | 1.000000 | 1.000000 |
5 | 120761 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.300000 | 1.0 | 1.0000 | 0.428571 | 0.688571 |
2 | 1 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.111111 | 0.0 | 0.0000 | 0.083333 | 0.047222 |
3 | 5 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.083333 | 0.0 | 0.0000 | 0.083333 | 0.041667 |
8 | 232555 | 1 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | 1.0000 | 1.000000 | 1.000000 |
9 | 13944 | 1 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.187500 | 1.0 | 1.0000 | 0.250000 | 0.612500 |
8 rows × 1038 columns