{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68518411",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install dependencies with %pip (not !pip) so that the packages land in the\n",
    "# environment of the kernel that is running this notebook.\n",
    "#%pip install jaal pandas networkx matplotlib snap-stanford\n",
    "%pip install snap-stanford\n",
    "%pip install networkx\n",
    "\n",
    "# Hide errors due to unknown chars!\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import pandas as pd\n",
    "import re\n",
    "import json\n",
    "import numpy as np\n",
    "import networkx as nx\n",
    "import matplotlib.pyplot as plt\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbdc173a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quarry query for user talk page relations\n",
    "\n",
    "# User only relations t1 and t2 columns\n",
    "# https://quarry.wmcloud.org/query/60589\n",
    "# https://quarry.wmcloud.org/run/600192/output/0/csv\n",
    "\n",
    "# User and project relations, t1, t1ns, t2, t2ns\n",
    "# https://quarry.wmcloud.org/query/60592\n",
    "# https://quarry.wmcloud.org/run/600233/output/0/csv\n",
    "\n",
    "CSV_URL = 'https://quarry.wmcloud.org/run/600233/output/0/csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6a39601",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data\n",
    "# The title columns are forced to str and NA detection is switched off:\n",
    "# by default pandas parses titles such as \"NA\", \"null\" or \"NaN\" as a float NaN,\n",
    "# which would make the regex based IP filter below raise a TypeError.\n",
    "df = pd.read_csv(CSV_URL, dtype={\"t1\": str, \"t2\": str}, keep_default_na=False)\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41a73c1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter & sort data\n",
    "# - Remove IPs\n",
    "# - Find the most linked nodes\n",
    "# - Filter out some of the cruft\n",
    "ipv4 = r\"\\d+\\.\\d+\\.\\d+\\.\\d+\"\n",
    "ipv6 = r\"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\"\n",
    "\n",
    "def starts_with_ip(titles):\n",
    "    \"\"\"Flag the titles that begin with an IPv4 or IPv6 address.\n",
    "\n",
    "    titles: pandas Series of strings.\n",
    "    Returns a boolean Series with the same index.\n",
    "\n",
    "    Series.str.match only anchors the pattern at the start of the string\n",
    "    (like re.match), so a title that continues after the address is flagged too.\n",
    "    \"\"\"\n",
    "    return titles.str.match(ipv4, na=False) | titles.str.match(ipv6, na=False)\n",
    "\n",
    "# Remove IPs: keep a row only when neither end of the link is an IP address\n",
    "df_filtered = df[~(starts_with_ip(df[\"t1\"]) | starts_with_ip(df[\"t2\"]))]\n",
    "\n",
    "# Count value occurrence: the number of links every title takes part in,\n",
    "# no matter on which side of the link it appears\n",
    "values = list(df_filtered['t1'].values) + list(df_filtered['t2'].values)\n",
    "counted_values = Counter(values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9497faf1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_by_link_count(edges, counts, one_must_have, both_must_have, one_must_not_have=999999999):\n",
    "    \"\"\"Keep only the links whose two ends are referenced often enough.\n",
    "\n",
    "    edges: DataFrame with the columns t1 and t2 (the two ends of a link).\n",
    "    counts: mapping from a title to the number of links it is involved in.\n",
    "    one_must_have: at least one end needs this many links.\n",
    "    both_must_have: both ends need at least this many links.\n",
    "    one_must_not_have: at least one end must stay at or below this many links;\n",
    "        a row is only dropped by this check when both ends exceed it. The\n",
    "        default is so large that the check is effectively switched off.\n",
    "\n",
    "    Returns the rows of edges that pass all three checks.\n",
    "    \"\"\"\n",
    "    t1_count = edges[\"t1\"].map(counts)  # times t1 is involved in a link\n",
    "    t2_count = edges[\"t2\"].map(counts)  # times t2 is involved in a link\n",
    "    keep = (\n",
    "        ((t1_count >= one_must_have) | (t2_count >= one_must_have))\n",
    "        & ((t1_count >= both_must_have) & (t2_count >= both_must_have))\n",
    "        & ((t1_count <= one_must_not_have) | (t2_count <= one_must_not_have))\n",
    "    )\n",
    "    return edges[keep]\n",
    "\n",
    "\n",
    "def draw_relation_graph(edges):\n",
    "    \"\"\"Build a graph from the t1/t2 columns of edges, draw it and return it.\"\"\"\n",
    "    graph = nx.from_pandas_edgelist(edges, source='t1', target='t2')\n",
    "    fig, ax = plt.subplots(1, 1, figsize=(100, 100))\n",
    "    # https://networkx.org/documentation/stable/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html\n",
    "    nx.draw_networkx(graph, ax=ax, with_labels=True, node_size=5, style=\"dashed\", width=0.5, edge_color='slategrey')\n",
    "    return graph\n",
    "\n",
    "\n",
    "df_1 = filter_by_link_count(df_filtered, counted_values, one_must_have=200, both_must_have=25)\n",
    "\n",
    "print(df_1)\n",
    "\n",
    "G = draw_relation_graph(df_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e272fe28",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stricter hub threshold, looser threshold for the other end of the link\n",
    "df_2 = filter_by_link_count(df_filtered, counted_values, one_must_have=900, both_must_have=10)\n",
    "\n",
    "print(df_2)\n",
    "\n",
    "G = draw_relation_graph(df_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37350a1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the very largest hubs, with almost any page allowed on the other end.\n",
    "# The result gets its own name (df_3) so that df_2 is not overwritten.\n",
    "df_3 = filter_by_link_count(df_filtered, counted_values, one_must_have=1500, both_must_have=5)\n",
    "\n",
    "print(df_3)\n",
    "\n",
    "G = draw_relation_graph(df_3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91e6f8a8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}