Embeddings for the DDIS Movie Graph

Setup

In [1]:
# imports
import csv
import numpy as np
import os
import rdflib
import pandas as pd
from sklearn.metrics import pairwise_distances
In [2]:
# define some prefixes
WD = rdflib.Namespace('http://www.wikidata.org/entity/')
WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')
DDIS = rdflib.Namespace('http://ddis.ch/atai/')
RDFS = rdflib.namespace.RDFS
SCHEMA = rdflib.Namespace('http://schema.org/')

Load the data

TODO: Set the correct path to the movie graph file

Download the movie graph file from https://files.ifi.uzh.ch/ddis/teaching/2021/ATAI/dataset/

In [3]:
# load the graph
graph = rdflib.Graph().parse('ddis-movie-graph.nt', format='turtle')
In [4]:
# load the embeddings
entity_emb = np.load('entity_embeds.npy')
relation_emb = np.load('relation_embeds.npy')
In [5]:
# load the dictionaries
with open('entity_ids.del', 'r') as ifile:
    ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
    id2ent = {v: k for k, v in ent2id.items()}
with open('relation_ids.del', 'r') as ifile:
    rel2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
    id2rel = {v: k for k, v in rel2id.items()}
In [6]:
ent2lbl = {ent: str(lbl) for ent, lbl in graph.subject_objects(RDFS.label)}
lbl2ent = {lbl: ent for ent, lbl in ent2lbl.items()}

Inspect the data

In [7]:
# number of triples in the graph
len(graph)
Out[7]:
2056777
In [8]:
# number of entities in the graph
triples = {(s, p, o) for s,p,o in graph.triples((None, None, None)) if isinstance(o, rdflib.term.URIRef)}
len({s for s,p,o in triples} | {o for s,p,o in triples})
Out[8]:
158900
In [9]:
# entity embedding size
entity_emb.shape
Out[9]:
(158901, 256)
In [10]:
# relation embedding size
relation_emb.shape
Out[10]:
(248, 256)

Finding errors

In [11]:
# let's see what our graph thinks the occupation of Jean Van Hamme is
professions = set(graph.query('''
    prefix wdt: <http://www.wikidata.org/prop/direct/>
    prefix wd: <http://www.wikidata.org/entity/>
    
    SELECT ?obj ?lbl WHERE {
        ?ent rdfs:label "Jean Van Hamme"@en .
        ?ent wdt:P106 ?obj .
        ?obj rdfs:label ?lbl .
    }
    '''))
{ent[len(WD):]: str(lbl) for ent, lbl in professions}
Out[11]:
{'Q329737': 'butcher'}
In [12]:
# "Jean Van Hamme" entity
head = entity_emb[ent2id[WD.Q428160]]
# "occupation" relation
pred = relation_emb[rel2id[WDT.P106]]
# add vectors according to TransE scoring function.
lhs = head + pred
# compute distance to *any* entity
dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
# find most plausible entities
most_likely = dist.argsort()
# compute ranks of entities
ranks = dist.argsort().argsort()
In [13]:
# show scores for (Jean Van Hamme, occupation, butcher)
pd.DataFrame([(str(lbl), dist[ent2id[ent]], ranks[ent2id[ent]]) for ent, lbl in professions],
        columns=('Occupation', 'Score', 'Rank'))
Out[13]:
Occupation Score Rank
0 butcher 5196.63623 86817
In [14]:
# what would be more plausible occupations?
pd.DataFrame([
    (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], rank+1)
    for rank, idx in enumerate(most_likely[:10])],
    columns=('Entity', 'Label', 'Score', 'Rank'))
Out[14]:
Entity Label Score Rank
0 Q36180 writer 3223.151367 1
1 Q33999 actor 3382.129883 2
2 Q6625963 novelist 3397.582520 3
3 Q1930187 journalist 3492.735352 4
4 Q4610556 model 3549.961182 5
5 Q639669 musician 3616.397949 6
6 Q67311526 Obalky knih.cz 3628.230957 7
7 Q150 French 3667.630615 8
8 Q1028181 painter 3694.667480 9
9 Q245068 comedian 3744.303955 10

Entity Similarity

In [15]:
# which entities are similar to "Harry Potter and the Goblet of Fire"
ent = ent2id[WD.Q102225]
# we compare the embedding of the query entity to all other entity embeddings
dist = pairwise_distances(entity_emb[ent].reshape(1, -1), entity_emb).reshape(-1)
# order by plausibility
most_likely = dist.argsort()

pd.DataFrame([
    (
        id2ent[idx][len(WD):], # qid
        ent2lbl[id2ent[idx]],  # label
        dist[idx],             # score
        rank+1,                # rank
    )
    for rank, idx in enumerate(most_likely[:15])],
    columns=('Entity', 'Label', 'Score', 'Rank'))
Out[15]:
Entity Label Score Rank
0 Q102225 Harry Potter and the Goblet of Fire 0.000000 1
1 Q102235 Harry Potter and the Order of the Phoenix 1839.364746 2
2 Q161678 Harry Potter and the Deathly Hallows – Part 1 1864.835327 3
3 Q161687 Harry Potter and the Half-Blood Prince 1895.976562 4
4 Q232009 Harry Potter and the Deathly Hallows – Part 2 1941.686646 5
5 Q102244 Harry Potter and the Chamber of Secrets 2000.969482 6
6 Q102438 Harry Potter and the Philosopher's Stone 2059.114502 7
7 Q102448 Harry Potter and the Prisoner of Azkaban 2140.694092 8
8 Q18199330 Fantastic Beasts and Where to Find Them 2350.040771 9
9 Q1880543 Harry Potter and the Deathly Hallows 2419.553467 10
10 Q216930 Harry Potter film series 2449.367432 11
11 Q849901 Harry Potter and the Goblet of Fire 2449.732178 12
12 Q18199331 Fantastic Beasts: The Crimes of Grindelwald 2490.106934 13
13 Q20735644 Me Before You 2509.801758 14
14 Q28146833 Severus Snape and the Marauders 2509.966797 15

Recovering categories

In [16]:
# hmm, our graph contains no parent class of bridge (Q12280)...
set(graph.objects(WD.Q12280, WDT.P279))
Out[16]:
set()
In [17]:
# maybe an indirect subclass?
set(graph.objects(WD.Q12280, DDIS.indirectSubclassOf))
Out[17]:
set()
In [18]:
# Let's see if we can recover this from embeddings...

# set the head entity to bridge
head = entity_emb[ent2id[WD.Q12280]]
# let's try with wdt:P279 (subClassOf) first
pred = relation_emb[rel2id[WDT.P279]]
# combine according to the TransE scoring function
lhs = head + pred
# compute distance to *any* entity
dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
# find most plausible tails
most_likely = dist.argsort()
# show most likely entities
pd.DataFrame([
    (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], rank+1)
    for rank, idx in enumerate(most_likely[:10])],
    columns=('Entity', 'Label', 'Score', 'Rank'))
Out[18]:
Entity Label Score Rank
0 Q12280 bridge 3145.157227 1
1 Q61457040 Ramsar site in Australia 5208.453125 2
2 Q6502866 cliffed coast 5241.154785 3
3 Q595452 baseball venue 5261.777832 4
4 Q19368170 Pont d'en Gómez 5276.720215 5
5 Q2463705 Special Protection Area 5281.016113 6
6 Q2066754 manor 5301.109863 7
7 Q17468479 district of Oulu 5323.046875 8
8 Q1049757 multi-purpose stadium 5326.457520 9
9 Q202570 Ferris wheel 5341.973633 10
In [19]:
# ... didn't really help.
# Let's try ddis:indirectSubclassOf next

# set the head entity to bridge
head = entity_emb[ent2id[WD.Q12280]]
# now we try ddis:indirectSubclassOf
pred = relation_emb[rel2id[DDIS.indirectSubclassOf]]
# combine according to the TransE scoring function
lhs = head + pred
# compute distance to *any* entity
dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
# find most plausible tails
most_likely = dist.argsort()
# show most likely entities
pd.DataFrame([
    (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], rank+1)
    for rank, idx in enumerate(most_likely[:10])],
    columns=('Entity', 'Label', 'Score', 'Rank'))
Out[19]:
Entity Label Score Rank
0 Q12280 bridge 4617.544922 1
1 Q27096213 geographic entity 5803.001465 2
2 Q95074 fictional character 6175.950195 3
3 Q19368170 Pont d'en Gómez 6219.216797 4
4 Q2282230 River Kwai bridge 6309.948242 5
5 Q1323635 Petit Pont 6325.951660 6
6 Q6502866 cliffed coast 6327.051270 7
7 Q1497364 building complex 6341.214355 8
8 Q2151232 townland 6382.717773 9
9 Q2080521 market hall 6385.139160 10