Collect training data from application

Collect training data from a Vespa application to analyse and/or improve ranking functions.

Example setup

Connect to the application and define a query model.

from vespa.application import Vespa
from learntorank.query import QueryModel, Ranking, OR

# Application endpoint used throughout this example.
app = Vespa(url="https://api.cord19.vespa.ai")

# Query model: OR() match phase combined with the "bm25" ranking.
# NOTE(review): list_features=True presumably asks for rank features to be
# returned with each hit -- confirm against the learntorank documentation.
query_model = QueryModel(
    match_phase=OR(),
    ranking=Ranking(name="bm25", list_features=True),
)

Define some labeled data: each entry holds a query, its id, and the documents marked as relevant to it.

# Labeled queries. Each resulting entry has the shape
#   {"query_id": int, "query": str, "relevant_docs": [{"id": int, "score": 1}, ...]}
# and query ids are assigned by position, starting at 0.
labeled_data = [
    {
        "query_id": query_id,
        "query": query_text,
        "relevant_docs": [{"id": doc_id, "score": 1} for doc_id in relevant_ids],
    }
    for query_id, (query_text, relevant_ids) in enumerate(
        [
            (
                "Intrauterine virus infections and congenital heart disease",
                (0, 3),
            ),
            (
                "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
                (1, 5),
            ),
        ]
    )
]

Collect training data in batch

from learntorank.query import collect_vespa_features

# Collect features for every labeled query in a single call.
# NOTE(review): number_additional_docs=2 appears to add two extra documents
# per query (the label-0 rows in the output below) -- confirm in the docs.
training_data_batch = collect_vespa_features(
    app=app,
    labeled_data=labeled_data,
    id_field="id",
    query_model=query_model,
    number_additional_docs=2,
    fields=["rankfeatures"],
)
# Bare expression so the notebook displays the collected data.
training_data_batch
document_id query_id label attributeMatch(authors.first) attributeMatch(authors.first).averageWeight attributeMatch(authors.first).completeness attributeMatch(authors.first).fieldCompleteness attributeMatch(authors.first).importance attributeMatch(authors.first).matches attributeMatch(authors.first).maxWeight ... textSimilarity(results).fieldCoverage textSimilarity(results).order textSimilarity(results).proximity textSimilarity(results).queryCoverage textSimilarity(results).score textSimilarity(title).fieldCoverage textSimilarity(title).order textSimilarity(title).proximity textSimilarity(title).queryCoverage textSimilarity(title).score
0 0 0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.062500 0.0 0.0000 0.142857 0.055357
1 3 0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.142857 0.0 0.4375 0.142857 0.224554
4 255164 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.000000 1.0 1.0000 1.000000 1.000000
5 120761 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.300000 1.0 1.0000 0.428571 0.688571
2 1 1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.111111 0.0 0.0000 0.083333 0.047222
3 5 1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.083333 0.0 0.0000 0.083333 0.041667
8 232555 1 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.000000 1.0 1.0000 1.000000 1.000000
9 13944 1 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.187500 1.0 1.0000 0.250000 0.612500

8 rows × 1038 columns