Notebook

Loading [MathJax]/extensions/Safe.js

#!pip install jaal pandas networkx matplotlib snap-stanford
!pip install snap-stanford
!pip install networkx

# Hide errors due to unknown chars!
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import re
import json
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter

# Quarry query for user talk page relations

# User only relations t1 and t2 columns
# https://quarry.wmcloud.org/query/60589
# https://quarry.wmcloud.org/run/600192/output/0/csv

# User and project relations, t1, t1ns, t2, t2ns
# https://quarry.wmcloud.org/query/60592
# https://quarry.wmcloud.org/run/600233/output/0/csv

CSV_URL = 'https://quarry.wmcloud.org/run/600233/output/0/csv'

# Load the data.
# The t1/t2 title columns are read verbatim as strings:
# - dtype=str stops all-digit titles from being parsed as numbers (pandas may
#   otherwise produce mixed int/str columns, and the warning about that is
#   suppressed by the warnings filter at the top of the notebook);
# - keep_default_na=False stops titles such as "NA", "null" or "None" from
#   being turned into NaN, which the regex-based IP filter cannot handle.
df = pd.read_csv(
    CSV_URL,
    dtype={'t1': str, 't2': str},
    keep_default_na=False,
)
print(df)

# Filter & sort data
# - Remove IPs
# - Find the most linked nodes
# - Filter out some of the cruft
# Patterns for IP-address titles. Both are applied with ``match`` semantics,
# i.e. anchored at the start of the value only, so a value that merely begins
# with an address is also treated as an IP.
ipv4 = r"\d+\.\d+\.\d+\.\d+"
ipv6 = r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"

# Compile once up front; the predicate below runs for every row of the frame.
_IPV4_RE = re.compile(ipv4)
_IPV6_RE = re.compile(ipv6)


def _looks_like_ip(value):
    """Return True if *value* is a string that starts with an IPv4/IPv6 address."""
    # Non-string cells (for example NaN, which pandas produces for missing
    # values) cannot be addresses, and passing them to the regex would raise
    # TypeError and abort the whole filter.
    if not isinstance(value, str):
        return False
    return _IPV4_RE.match(value) is not None or _IPV6_RE.match(value) is not None


# Remove IPs
def hasip(row):
    """Row predicate for dropping IP rows: True means *keep* the row.

    Despite the name, the result is True when neither ``row["t1"]`` nor
    ``row["t2"]`` starts with an IPv4 or IPv6 address, and False as soon as
    either of them does.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with "t1" and "t2".

    Returns:
        bool: True to keep the row, False to discard it.
    """
    return not (_looks_like_ip(row["t1"]) or _looks_like_ip(row["t2"]))
# Keep only the rows the hasip predicate accepts (it returns True to keep).
df_filtered = df[df.apply(hasip, axis=1)]

# Tally how often each value occurs across both the t1 and t2 columns;
# counted_values maps a value to the number of rows it appears in on either side.
values = [*df_filtered['t1'].values, *df_filtered['t2'].values]
counted_values = Counter(values)

# Link-count thresholds for this plot; hasEnoughRefs reads them at call time.
one_must_have = 200  # at least one endpoint must reach this count
both_must_have = 25  # each endpoint must reach this count
one_must_not_have = 999999999  # at least one endpoint must be at or below this


def hasEnoughRefs(row):
    """Row predicate: True means keep the edge.

    Looks up how many links ``row["t1"]`` and ``row["t2"]`` take part in
    (via the module-level ``counted_values``) and keeps the row only when
    the larger count is >= ``one_must_have``, the smaller count is
    >= ``both_must_have``, and the smaller count is <= ``one_must_not_have``.
    """
    link_counts = (counted_values[row["t1"]], counted_values[row["t2"]])
    busiest, quietest = max(link_counts), min(link_counts)
    if busiest < one_must_have:
        return False
    if quietest < both_must_have:
        return False
    return quietest <= one_must_not_have
# Apply the current hasEnoughRefs thresholds to the IP-free edge list.
df_1 = df_filtered[df_filtered.apply(hasEnoughRefs, axis=1)]

print(df_1)

# Build a graph with one edge per remaining (t1, t2) row and draw it on a
# single, very large axes.
G = nx.from_pandas_edgelist(df_1, 't1', 't2')
fig, ax = plt.subplots(figsize=(100, 100))
# Keyword reference for draw_networkx (note: the link is to the old 1.7 docs):
# https://networkx.org/documentation/networkx-1.7/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html
nx.draw_networkx(
    G,
    ax=ax,
    with_labels=True,
    node_size=5,
    width=0.5,
    style="dashed",
    edge_color='slategrey',
)

# Link-count thresholds for this plot; hasEnoughRefs reads them at call time.
one_must_have = 900  # at least one endpoint must reach this count
both_must_have = 10  # each endpoint must reach this count
one_must_not_have = 999999999  # at least one endpoint must be at or below this


def hasEnoughRefs(row):
    """Row predicate: True means keep the edge.

    Looks up how many links ``row["t1"]`` and ``row["t2"]`` take part in
    (via the module-level ``counted_values``) and keeps the row only when
    the larger count is >= ``one_must_have``, the smaller count is
    >= ``both_must_have``, and the smaller count is <= ``one_must_not_have``.
    """
    link_counts = (counted_values[row["t1"]], counted_values[row["t2"]])
    busiest, quietest = max(link_counts), min(link_counts)
    if busiest < one_must_have:
        return False
    if quietest < both_must_have:
        return False
    return quietest <= one_must_not_have
# Apply the current hasEnoughRefs thresholds to the IP-free edge list.
df_2 = df_filtered[df_filtered.apply(hasEnoughRefs, axis=1)]

print(df_2)

# Build a graph with one edge per remaining (t1, t2) row and draw it on a
# single, very large axes.
G = nx.from_pandas_edgelist(df_2, 't1', 't2')
fig, ax = plt.subplots(figsize=(100, 100))
# Keyword reference for draw_networkx (note: the link is to the old 1.7 docs):
# https://networkx.org/documentation/networkx-1.7/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html
nx.draw_networkx(
    G,
    ax=ax,
    with_labels=True,
    node_size=5,
    width=0.5,
    style="dashed",
    edge_color='slategrey',
)

# Link-count thresholds for this plot; hasEnoughRefs reads them at call time.
one_must_have = 1500  # at least one endpoint must reach this count
both_must_have = 5  # each endpoint must reach this count
one_must_not_have = 999999999  # at least one endpoint must be at or below this


def hasEnoughRefs(row):
    """Row predicate: True means keep the edge.

    Looks up how many links ``row["t1"]`` and ``row["t2"]`` take part in
    (via the module-level ``counted_values``) and keeps the row only when
    the larger count is >= ``one_must_have``, the smaller count is
    >= ``both_must_have``, and the smaller count is <= ``one_must_not_have``.
    """
    link_counts = (counted_values[row["t1"]], counted_values[row["t2"]])
    busiest, quietest = max(link_counts), min(link_counts)
    if busiest < one_must_have:
        return False
    if quietest < both_must_have:
        return False
    return quietest <= one_must_not_have
# Apply the current hasEnoughRefs thresholds to the IP-free edge list.
# NOTE(review): this rebinds df_2, replacing the frame built for the previous
# plot -- possibly meant to be a new name such as df_3; confirm before renaming.
df_2 = df_filtered[df_filtered.apply(hasEnoughRefs, axis=1)]

print(df_2)

# Build a graph with one edge per remaining (t1, t2) row and draw it on a
# single, very large axes.
G = nx.from_pandas_edgelist(df_2, 't1', 't2')
fig, ax = plt.subplots(figsize=(100, 100))
# Keyword reference for draw_networkx (note: the link is to the old 1.7 docs):
# https://networkx.org/documentation/networkx-1.7/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html
nx.draw_networkx(
    G,
    ax=ax,
    with_labels=True,
    node_size=5,
    width=0.5,
    style="dashed",
    edge_color='slategrey',
)