1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | #!pip install jaal pandas networkx matplotlib snap-stanford !pip install snap-stanford !pip install networkx # Hide errors due to unknown chars! import warnings warnings.filterwarnings('ignore') import pandas as pd import re import json import numpy as np import networkx as nx import matplotlib.pyplot as plt from collections import Counter |
# Quarry queries that produce the user talk page relation data.
#
# Users only (columns: t1, t2):
#   query:  https://quarry.wmcloud.org/query/60589
#   output: https://quarry.wmcloud.org/run/600192/output/0/csv
#
# Users and projects (columns: t1, t1ns, t2, t2ns):
#   query:  https://quarry.wmcloud.org/query/60592
#   output: https://quarry.wmcloud.org/run/600233/output/0/csv

# The notebook currently works on the "users and projects" result set.
CSV_URL = 'https://quarry.wmcloud.org/run/600233/output/0/csv'
# Load the data.
#
# The t1/t2 columns hold page titles, so they are read as plain strings and
# only a truly empty cell is treated as missing. With pandas' defaults a title
# such as "NA", "None" or "null" would silently become NaN, and a
# numeric-looking title could be type-inferred instead of kept as text.
df = pd.read_csv(
    CSV_URL,
    dtype={"t1": str, "t2": str},
    keep_default_na=False,
    na_values=[""],
)
print(df)
# Filter & sort data
# - Remove IPs
# - Find the most linked nodes
# - Filter out some of the cruft

ipv4 = r"\d+\.\d+\.\d+\.\d+"
ipv6 = r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))"

# Compile once here instead of handing the pattern strings to re.match for
# every row of the frame.
_ip_patterns = (re.compile(ipv4), re.compile(ipv6))


# Remove IPs
def hasip(row):
    """Return True when the row should be KEPT, i.e. neither title is an IP.

    Despite its name, this is the keep-predicate used to build df_filtered:
    it returns False as soon as row["t1"] or row["t2"] starts with an IPv4 or
    IPv6 address. Matching is anchored at the start of the title only, so a
    title with trailing text after the address is also treated as an IP.

    A title that is not a string (for example NaN from an empty CSV cell)
    cannot be matched and cannot serve as a graph node, so such rows are
    dropped instead of raising TypeError inside re.
    """
    for title in (row["t1"], row["t2"]):
        if not isinstance(title, str):
            return False
        if any(pattern.match(title) for pattern in _ip_patterns):
            return False
    return True


df_filtered = df[df.apply(hasip, axis=1)]

# Count value occurrences: how many times each title appears as either end
# of a link.
values = list(df_filtered['t1'].values) + list(df_filtered['t2'].values)
counted_values = Counter(values)

#unique_values_list = list(counted_values.keys())
#val_to_int = {}
#for i in range(0, len(unique_values_list)) :
#    val_to_int[unique_values_list[i]] = i

#print(counted_values)
# Link-count thresholds for this view of the graph.
one_must_have = 200            # at least one end of the link needs this many links
both_must_have = 25            # both ends of the link need at least this many links
one_must_not_have = 999999999  # at least one end must stay at or below this


def hasEnoughRefs(row):
    """Return True when the link in `row` passes the thresholds above.

    The counts come from counted_values, i.e. the number of times a title is
    involved in a link in df_filtered.
    """
    count_a = counted_values[row["t1"]]
    count_b = counted_values[row["t2"]]

    # Both ends must reach the lower bound.
    if count_a < both_must_have or count_b < both_must_have:
        return False
    # At least one end must be a heavily linked node.
    if count_a < one_must_have and count_b < one_must_have:
        return False
    # At least one end must not exceed the upper bound.
    return count_a <= one_must_not_have or count_b <= one_must_not_have


df_1 = df_filtered[df_filtered.apply(hasEnoughRefs, axis=1)]
print(df_1)

G = nx.from_pandas_edgelist(df_1, source='t1', target='t2')
fig, ax = plt.subplots(1, 1, figsize=(100, 100))
# https://networkx.org/documentation/networkx-1.7/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html
nx.draw_networkx(
    G,
    ax=ax,
    with_labels=True,
    node_size=5,
    style="dashed",
    width=0.5,
    edge_color='slategrey',
)
# Link-count thresholds for this view of the graph.
one_must_have = 900            # at least one end of the link needs this many links
both_must_have = 10            # both ends of the link need at least this many links
one_must_not_have = 999999999  # at least one end must stay at or below this


def hasEnoughRefs(row):
    """Return True when the link in `row` passes the thresholds above.

    The counts come from counted_values, i.e. the number of times a title is
    involved in a link in df_filtered.
    """
    count_a = counted_values[row["t1"]]
    count_b = counted_values[row["t2"]]

    # Both ends must reach the lower bound.
    if count_a < both_must_have or count_b < both_must_have:
        return False
    # At least one end must be a heavily linked node.
    if count_a < one_must_have and count_b < one_must_have:
        return False
    # At least one end must not exceed the upper bound.
    return count_a <= one_must_not_have or count_b <= one_must_not_have


df_2 = df_filtered[df_filtered.apply(hasEnoughRefs, axis=1)]
print(df_2)

G = nx.from_pandas_edgelist(df_2, source='t1', target='t2')
fig, ax = plt.subplots(1, 1, figsize=(100, 100))
# https://networkx.org/documentation/networkx-1.7/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html
nx.draw_networkx(
    G,
    ax=ax,
    with_labels=True,
    node_size=5,
    style="dashed",
    width=0.5,
    edge_color='slategrey',
)
# Link-count thresholds for this view of the graph.
one_must_have = 1500           # at least one end of the link needs this many links
both_must_have = 5             # both ends of the link need at least this many links
one_must_not_have = 999999999  # at least one end must stay at or below this


def hasEnoughRefs(row):
    """Return True when the link in `row` passes the thresholds above.

    The counts come from counted_values, i.e. the number of times a title is
    involved in a link in df_filtered.
    """
    count_a = counted_values[row["t1"]]
    count_b = counted_values[row["t2"]]

    # Both ends must reach the lower bound.
    if count_a < both_must_have or count_b < both_must_have:
        return False
    # At least one end must be a heavily linked node.
    if count_a < one_must_have and count_b < one_must_have:
        return False
    # At least one end must not exceed the upper bound.
    return count_a <= one_must_not_have or count_b <= one_must_not_have


# NOTE(review): the result is stored in df_2, the same name the 900/10
# threshold cell assigns, so running this cell replaces that frame. Rename
# (e.g. to df_3) if both results need to coexist.
df_2 = df_filtered[df_filtered.apply(hasEnoughRefs, axis=1)]
print(df_2)

G = nx.from_pandas_edgelist(df_2, source='t1', target='t2')
fig, ax = plt.subplots(1, 1, figsize=(100, 100))
# https://networkx.org/documentation/networkx-1.7/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html
nx.draw_networkx(
    G,
    ax=ax,
    with_labels=True,
    node_size=5,
    style="dashed",
    width=0.5,
    edge_color='slategrey',
)
1 |