# imports
import csv
import numpy as np
import os
import rdflib
import pandas as pd
from sklearn.metrics import pairwise_distances
# Namespace prefixes used to build URIs throughout this script.
# rdfs vocabulary shipped with rdflib (used for rdfs:label lookups)
RDFS = rdflib.namespace.RDFS
# schema.org vocabulary
SCHEMA = rdflib.Namespace('http://schema.org/')
# course-specific vocabulary (e.g. ddis:indirectSubclassOf)
DDIS = rdflib.Namespace('http://ddis.ch/atai/')
# Wikidata entities (wd:Q...)
WD = rdflib.Namespace('http://www.wikidata.org/entity/')
# Wikidata direct properties (wdt:P...)
WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')
# TODO: Set the correct path to the movie graph file.
# Download the movie graph file from https://files.ifi.uzh.ch/ddis/teaching/2021/ATAI/dataset/
# Create an empty graph, then fill it from the movie graph file on disk.
# The file is read with rdflib's Turtle parser (format='turtle').
graph = rdflib.Graph()
graph.parse('ddis-movie-graph.nt', format='turtle')
# Load the entity and relation embedding matrices.
entity_emb = np.load('entity_embeds.npy')
relation_emb = np.load('relation_embeds.npy')

# Load the dictionaries that map URIs to row indices of the embedding
# matrices. Every row of the tab-separated files is read as (index, URI).
# The comprehension has to be indented under its `with` statement; without
# the indentation the script cannot be parsed at all. The files are opened
# with newline='' as the csv module documentation recommends.
with open('entity_ids.del', 'r', newline='') as ifile:
    ent2id = {
        rdflib.term.URIRef(ent): int(idx)
        for idx, ent in csv.reader(ifile, delimiter='\t')
    }
# inverse lookup: row index -> entity URI
id2ent = {v: k for k, v in ent2id.items()}

with open('relation_ids.del', 'r', newline='') as ifile:
    rel2id = {
        rdflib.term.URIRef(rel): int(idx)
        for idx, rel in csv.reader(ifile, delimiter='\t')
    }
# inverse lookup: row index -> relation URI
id2rel = {v: k for k, v in rel2id.items()}

# Label lookups built from every rdfs:label triple in the graph.
# NOTE: an entity with several labels keeps only the last one seen.
ent2lbl = {ent: str(lbl) for ent, lbl in graph.subject_objects(RDFS.label)}
# NOTE: when several entities share a label, only one of them is kept.
lbl2ent = {lbl: ent for ent, lbl in ent2lbl.items()}
# How many triples does the graph hold?
len(graph)
# How many entities? Only triples whose object is a URI are kept; an entity
# is anything that occurs as the subject or the object of such a triple.
triples = {
    (s, p, o)
    for s, p, o in graph.triples((None, None, None))
    if isinstance(o, rdflib.term.URIRef)
}
len({node for s, _, o in triples for node in (s, o)})
# Shape of the entity embedding matrix
entity_emb.shape
# Shape of the relation embedding matrix
relation_emb.shape
# Ask the graph which occupations (wdt:P106) it records for the entity
# labelled "Jean Van Hamme", together with the label of each occupation.
professions = set(
    graph.query('''
prefix wdt: <http://www.wikidata.org/prop/direct/>
prefix wd: <http://www.wikidata.org/entity/>
SELECT ?obj ?lbl WHERE {
?ent rdfs:label "Jean Van Hamme"@en .
?ent wdt:P106 ?obj .
?obj rdfs:label ?lbl .
}
''')
)
# Map each occupation id (its URI with the WD namespace prefix cut off)
# to the occupation's label.
{uri[len(WD):]: str(label) for uri, label in professions}
# Embedding of the head entity "Jean Van Hamme" (wd:Q428160)
head = entity_emb[ent2id[WD.Q428160]]
# Embedding of the "occupation" relation (wdt:P106)
pred = relation_emb[rel2id[WDT.P106]]
# Add the vectors according to the TransE scoring function: a plausible
# tail entity should lie close to head + relation.
lhs = head + pred
# Distance from that vector to *any* entity (smaller = more plausible)
dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
# Entity indices ordered from most to least plausible
most_likely = dist.argsort()
# 0-based position of every entity in that ordering; reuses the sort above
# instead of sorting `dist` a second time.
ranks = most_likely.argsort()
# Show score and rank for the occupations returned by the graph query
# (`professions`). The rank is displayed 1-based so that it agrees with the
# other result tables in this script, which all report `rank+1`.
pd.DataFrame(
    [(str(lbl), dist[ent2id[ent]], ranks[ent2id[ent]] + 1) for ent, lbl in professions],
    columns=('Occupation', 'Score', 'Rank'))
# Which entities would be more plausible occupations? List the ten entities
# with the smallest distance, ranked starting at 1.
pd.DataFrame(
    [
        (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], position)
        for position, idx in enumerate(most_likely[:10], start=1)
    ],
    columns=('Entity', 'Label', 'Score', 'Rank'),
)
# Which entities are similar to "Harry Potter and the Goblet of Fire"?
# row index of the query entity (wd:Q102225)
ent = ent2id[WD.Q102225]
# Compare the query entity's embedding to all entity embeddings
# (the query entity itself is part of `entity_emb`, too).
dist = pairwise_distances(entity_emb[ent].reshape(1, -1), entity_emb).ravel()
# Order by plausibility: smallest distance first
most_likely = np.argsort(dist)
# Table of the 15 closest entities: qid, label, score and 1-based rank
pd.DataFrame(
    [
        (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], position)
        for position, idx in enumerate(most_likely[:15], start=1)
    ],
    columns=('Entity', 'Label', 'Score', 'Rank'),
)
# Look up the parent classes of bridge (wd:Q12280) via wdt:P279 ...
# (hmm, our graph contains none)
set(graph.objects(subject=WD.Q12280, predicate=WDT.P279))
# ... and check for an indirect subclass relation as well.
set(graph.objects(subject=WD.Q12280, predicate=DDIS.indirectSubclassOf))
# Can the parent class of bridge be recovered from the embeddings?
# head entity: bridge (wd:Q12280)
head = entity_emb[ent2id[WD.Q12280]]
# relation: wdt:P279 (subClassOf) is tried first
pred = relation_emb[rel2id[WDT.P279]]
# TransE scoring function: head + relation should land near the tail
lhs = head + pred
# distance from that vector to *any* entity
dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).ravel()
# most plausible tails first
most_likely = np.argsort(dist)
# Table of the ten most likely tail entities with 1-based rank
pd.DataFrame(
    [
        (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], position)
        for position, idx in enumerate(most_likely[:10], start=1)
    ],
    columns=('Entity', 'Label', 'Score', 'Rank'),
)
# ... that didn't really help, so try ddis:indirectSubclassOf next.
# head entity: bridge (wd:Q12280)
head = entity_emb[ent2id[WD.Q12280]]
# relation: ddis:indirectSubclassOf this time
pred = relation_emb[rel2id[DDIS.indirectSubclassOf]]
# TransE scoring function: head + relation should land near the tail
lhs = head + pred
# distance from that vector to *any* entity
dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).ravel()
# most plausible tails first
most_likely = np.argsort(dist)
# Table of the ten most likely tail entities with 1-based rank
pd.DataFrame(
    [
        (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], position)
        for position, idx in enumerate(most_likely[:10], start=1)
    ],
    columns=('Entity', 'Label', 'Score', 'Rank'),
)