Knowledge Graphs in Python

Knowledge graphs are a way to visualize relationships between entities; they can make a relationship simpler to understand.

The data used is a collection of sentences extracted from Wikipedia.

import reimport pandas as pdimport spacyfrom spacy import displacynlp = spacy.load('en_core_web_sm')from spacy.matcher import Matcherimport networkx as nximport matplotlib.pyplot as pltfrom tqdm import tqdmpd.set_option('display.max_colwidth', 200)%matplotlib inline

df = pd.read_csv("https://raw.githubusercontent.com/phgunawan/Latihan-ML/master/wiki_sentences_v2.csv")

df

	sentence
0	confused and frustrated, connie decides to leave on her own.
1	later, a woman’s scream is heard in the distance.
2	christian is then paralyzed by an elder.
3	the temple is set on fire.
4	outside, the cult wails with him.
...	...
4313	confidencial also responded negatively, calling the film a barren drama, unsubtle and self-indulgent.
4314	and le parisien gave the film their highest five-star rating.
4315	the museum collection includes 37,000 film titles, 60,000 posters, 700,000 photographs and 20,000 books.
4316	its predecessor was the dutch historical film archive, founded in 1946.
4317	, 1920'sfilmstar greta garbo by alexander binder,

4318 rows × 1 columns

# detect dependencies: parse the first sentence and print each token next to
# the dependency label spaCy assigned to it
doc = nlp(df['sentence'][0])

print(f"{'Token':20}Dependency")
print('=' * 30)
for token in doc:
    # token text is padded to 20 columns so the labels line up
    print(f"{token.text:20}{token.dep_}")

Token               Dependency
==============================
confused            advcl
and                 cc
frustrated          conj
,                   punct
connie              nsubj
decides             ROOT
to                  aux
leave               xcomp
on                  prep
her                 poss
own                 pobj
.                   punct

def get_entities(sent):
    """
    Extract the subject and the object (entities) from a sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    scanned token by token. Compound words and modifiers seen on the way are
    remembered and prepended to the next subject/object token, so that
    multi-word entities are not reduced to a single word.

    Parameters
    ----------
    sent: string
        Sentence to get entities for.

    Returns
    -------
    List
        Two-element list ``[subject, object]``. An entity that was not found
        is returned as an empty string. If several tokens qualify, the last
        one in the sentence wins.
    """
    ## chunk 1
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of previous (non-punctuation) token
    prv_tok_text = ""   # text of previous (non-punctuation) token

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        ## chunk 2
        # punctuation is skipped entirely; it does not update the
        # "previous token" state either
        if tok.dep_ == "punct":
            continue

        # check: token is a compound word or not
        if tok.dep_ == "compound":
            prefix = tok.text
            # if the previous word was also a 'compound' then join the two
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # check: token is a modifier or not
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            # if the previous word was a 'compound' then join the two
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        ## chunk 3
        # Substring test instead of `find(...) == True`: the latter compares
        # an index with True and therefore only holds when the match starts
        # at index 1.
        if "subj" in tok.dep_:
            ent1 = modifier + " " + prefix + " " + tok.text
            # the collected prefix/modifier belong to the subject only
            prefix = ""
            modifier = ""

        ## chunk 4
        if "obj" in tok.dep_:
            # NOTE: an empty prefix leaves a double space inside the entity;
            # only the outer whitespace is stripped on return.
            ent2 = modifier + " " + prefix + " " + tok.text

        ## chunk 5
        # update variables
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

df['sentence'][5]

"it's a parable of a woman's religious awakening—"

get_entities(df['sentence'][5])

['it', 'religious  awakening']

# Extract a [subject, object] pair for every sentence; tqdm shows progress.
entity_pairs = [get_entities(i) for i in tqdm(df["sentence"])]

100%|██████████| 4318/4318 [00:38<00:00, 111.34it/s]

def get_relation(sent):
    """
    Get relation between sentence entities.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and a
    token pattern is matched against it: the ROOT token, optionally followed
    by a preposition, an agent and an adjective. The text of the last match
    reported by the matcher is used as the relation.

    Parameters
    ----------
    sent: string
        Sentence to get relations for.

    Returns
    ------
    string
        Relations in sent, or an empty string when the pattern matches
        nothing (for example an empty sentence).
    """
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # define the pattern: ROOT, then optional prep / agent / adjective
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    # List-of-patterns form of Matcher.add: required by spaCy v3 and accepted
    # by spaCy >= 2.2.2. The older (key, None, pattern) form fails on v3.
    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    if not matches:
        # nothing matched; indexing the last match would raise IndexError
        return ""

    # each match is (match_id, start, end); keep the last one reported
    _, start, end = matches[-1]
    return doc[start:end].text

relations = [get_relation(i) for i in tqdm(df['sentence'])]

100%|██████████| 4318/4318 [00:37<00:00, 115.69it/s]

# building the graph: one row per sentence, with the subject as source node,
# the object as target node and the relation as the edge label
kg_df = pd.DataFrame({
    'source': [pair[0] for pair in entity_pairs],   # extract subject
    'target': [pair[1] for pair in entity_pairs],   # extract object
    'edge': relations,
})
source = kg_df['source'].tolist()
target = kg_df['target'].tolist()

kg_df

	source	target	edge
0	connie	own	decides
1	later woman	distance	heard in
2	christian	then elder	paralyzed by
3	temple	fire	set on
4	outside cult	him	wails with
...	...	...	...
4313	confidencial	negatively film	responded
4314	le parisien	five star rating	gave
4315	museum collection	37,000 film titles	includes
4316	predecessor	historical film 1946	was
4317		1920'sfilmstar alexander binder	garbo by

4318 rows × 3 columns

# create a directed-graph from a dataframe; every other column of kg_df
# (here: 'edge') is attached to the edges as an attribute
graph = nx.from_pandas_edgelist(
    kg_df,
    source="source",
    target="target",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

# draw the whole graph on a 12x12 inch canvas using a spring layout
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(graph)
nx.draw(
    graph,
    pos=pos,
    with_labels=True,
    node_color='skyblue',
    edge_cmap=plt.cm.Blues,
)
plt.show()

png

That took 2 minutes because there are a lot of relations to visualize. Let's write a function to visualize a specific relation.

def plot_relation(relation):
    """
    Draw the part of the knowledge graph that uses a single relation.

    Only the rows of the global ``kg_df`` whose 'edge' value equals
    ``relation`` are turned into a directed multigraph, which is then drawn
    with a spring layout and shown.

    Parameters
    ----------
    relation: string
        The relationship to plot in kg_df.
    """
    # keep only the edges labelled with the requested relation
    matching_edges = kg_df[kg_df['edge'] == relation]

    com_graph = nx.from_pandas_edgelist(
        matching_edges,
        source="source",
        target="target",
        edge_attr=True,
        create_using=nx.MultiDiGraph(),
    )

    plt.figure(figsize=(12, 12))

    # k regulates the distance between nodes
    layout = nx.spring_layout(com_graph, k=0.5)

    draw_options = {
        'with_labels': True,
        'node_color': 'skyblue',
        'node_size': 1500,
        'edge_cmap': plt.cm.Blues,
    }
    nx.draw(com_graph, pos=layout, **draw_options)
    plt.show()

# get most common relations: the 20 most frequent edge labels
kg_df['edge'].value_counts().head(20)

is             370
was            297
released on     87
include         73
were            71
are             71
released        40
's              38
composed by     35
have            31
has             31
became          31
become          29
released in     27
included        26
produced        22
called          22
considered      20
made            20
had             20
Name: edge, dtype: int64

# plot some specific relations
plot_relation(relation="were")

png

# plot some specific relations
plot_relation(relation="had")

png

plot_relation("released in")

png

plot_relation("released on")

png

Knowledge Graphs in Python

4 min read.

You might also like

Varo Believe to Monarch: Automated PDF Statement Conversion for Monarch Money

structx: Type-Safe Structured Data Extraction from Any Document Using LLMs

scrapy-llm: Schema-Driven AI Web Scraping as a Scrapy Middleware

Wallhaven Wallpaper Reborn: A Full-Featured KDE Plasma 6 Wallpaper Plugin