图分析工具库networkx

networkx是基于Python开发的图论和复杂网络建模工具,内置了常用的图和复杂网络分析算法,可以方便地进行复杂网络数据分析、仿真建模等工作。

一、图的构建

STEP 1: 声明图对象
networkx有四种图 Graph 、DiGraph、MultiGraph、MultiDiGraph,分别为无多重边无向图、无多重边有向图、有多重边无向图、有多重边有向图。

1
2
3
4
5
6
7
8
# NetworkX offers four graph classes, distinguished by directedness and
# whether parallel (multiple) edges between the same node pair are allowed.
import networkx as nx 

G = nx.Graph() # undirected graph, no parallel edges
G = nx.Graph(A) # undirected graph built from adjacency matrix A (A must be defined by the caller)
G = nx.DiGraph() # directed graph, no parallel edges
G = nx.DiGraph(A) # directed graph built from adjacency matrix A
G = nx.MultiGraph() # undirected graph allowing parallel edges
G = nx.MultiDiGraph() # directed graph allowing parallel edges

STEP 2: 添加顶点和边

1
2
3
4
5
6
7
8
9
10
# Method 1: add nodes/edges one at a time -----------------------------
G.add_node('a') # add node 'a'
G.add_node(1) # add node 1 (any hashable object can be a node)
G.add_edge('x', 'y') # add an edge from x to y
G.add_weighted_edges_from([('x', 'y', 1.0)]) # the third tuple element is the edge weight
# Method 2: add several weighted edges from a list --------------------
l = [('a','b',5.0), ('b','c',3.0), ('a','c',1.0)]
G.add_weighted_edges_from(l)
# Method 3: build a graph from a pandas edge-list DataFrame -----------
# (assumes df_table is a DataFrame with columns 'A', 'B', 'C' — defined by the caller)
G = nx.from_pandas_edgelist(df_table, source = 'A', target = 'B', edge_attr = 'C', create_using = nx.DiGraph())

STEP 3: 查看图的信息

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Inspect basic graph statistics.
print('顶点的个数为', G.number_of_nodes())
print('边的条数为', G.number_of_edges())
print('图中顶点为', G.nodes())
print('图中边为', G.edges())
print('图中1顶点的邻居顶点为', [i for i in G.neighbors(1)])

# An edge weight can be any object, e.g. a dict of named weights.
G.add_weighted_edges_from([(1, 'y', {'a': 2, 'b': 9})])
print(G[1]['y']['weight']['a'])

# Print every edge whose weight is greater than 3.
# NOTE(review): this assumes every edge weight is numeric — the dict
# weight added just above would make `w > 3` raise TypeError; confirm intent.
for node, neighbors in G.adjacency():
    for neighbor, edge_attr in neighbors.items():
        w = edge_attr['weight']
        if w > 3:
            print(node, neighbor, w)

STEP 4: 可视化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Method 1: draw the whole graph with nx.draw -------------------------
import networkx as nx
import matplotlib.pyplot as plt

G = nx.random_graphs.barabasi_albert_graph(10, m=3)

nx.draw(G,
node_size = 600, # node size
node_shape = 'o', # node shape
# node_color = 'y', # node colour, e.g. one of r/b/y/k/w
with_labels = True, # whether nodes carry labels (default True)
font_color = 'w', # label font colour
alpha = 0.6, # transparency (default 1.0 = opaque, 0 = fully transparent)
width = 0.7, # edge line width (default 1.0)
style = 'dashed', # edge line style
edge_color = 'k', # edge colour
font_weight='bold',
pos = nx.shell_layout(G), # layout, e.g. spring_layout (force-directed) / shell_layout (concentric shells) / circular_layout (evenly on a circle) / random_layout (random positions)
)
plt.savefig("network.png")
# nx.write_gexf(G, 'network.gexf') # a .gexf file can be imported into Gephi for analysis
plt.show()

# Method 2: draw edges, edge labels, nodes and node labels separately -

import networkx as nx
import matplotlib.pyplot as plt

# build a small weighted graph on 5 nodes
G = nx.Graph()
G.add_weighted_edges_from([('0','1',2),('0','2',7),('1','2',3),('1','3',8),('1','4',5),('2','3',1),('3','4',4)])

# node labels and edge (weight) labels
labels={'0':'0','1':'1','2':'2','3':'3','4':'4'}
edge_labels = nx.get_edge_attributes(G, 'weight')
# node positions
pos = nx.spring_layout(G)
# draw the edges
nx.draw_networkx_edges(G, pos=pos, width=1.0, alpha=0.5, edge_color=['b','r','b','r','r','b','r'])
# draw the edge weight labels
nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=edge_labels)
# draw the nodes
nx.draw_networkx_nodes(G, pos=pos, node_color='g', node_size=500, alpha=0.8)
# draw the node labels
nx.draw_networkx_labels(G, pos=pos, labels=labels, font_size=16)
plt.show()

二、图的操作

1、移除某些顶点和边

1
2
3
4
5
6
7
8
9
10
11
12
import networkx as nx
import matplotlib.pyplot as plt

G = nx.random_graphs.barabasi_albert_graph(10, m=3)

# remove some nodes and edges; use G.clear() to remove everything
# NOTE(review): the graph is random, so edge (3, 4) may not exist and
# remove_edge would then raise NetworkXError — confirm intent.
G.remove_node(2)
G.remove_nodes_from([1, 5])
G.remove_edge(3, 4)

nx.draw(G, with_labels=True, font_weight='bold')
plt.show()

2、子图抽取

直接按顶点抽取:

1
2
3
4
5
6
7
8
9
10
11
12
import networkx as nx
import matplotlib.pyplot as plt

G = nx.random_graphs.barabasi_albert_graph(100, 3)

# induced subgraph on the given node set
sub_G = G.subgraph([1, 3, 5, 6, 8])

nx.draw(G, with_labels=True, font_weight='bold')
plt.show()

nx.draw(sub_G, with_labels=True, font_weight='bold')
plt.show()

按条件抽取:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import networkx as nx
import matplotlib.pyplot as plt

# build a directed graph with a per-node attribute dict
G = nx.DiGraph()
nodes = {'a':{'id':1}, 'b':{'id':1}, 'c':{'id':3}, 'd':{'id':4}}
edges = [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'd')]
G.add_nodes_from(nodes.items())
G.add_edges_from(edges)

# node filter factory
def flt_func_draw():
    """Return a predicate that accepts a node-attribute dict and is
    True for every node whose 'id' attribute is not 4."""
    return lambda d: d['id'] != 4

# filter the original graph's nodes, then take the induced subgraph
flt_func = flt_func_draw()
sub_G = G.subgraph(n for n, d in G.nodes(data=True) if flt_func(d))

nx.draw(G, with_labels=True, font_weight='bold')
plt.show()

nx.draw(sub_G, with_labels=True, font_weight='bold')
plt.show()

3、图的合并

1
2
3
4
5
6
7
8
9
10
11
import networkx as nx
import matplotlib.pyplot as plt

# two independent BA random graphs
G1 = nx.random_graphs.barabasi_albert_graph(10, m=3)

G2 = nx.random_graphs.barabasi_albert_graph(4, m=2)

# disjoint union: G2's node labels are shifted past G1's
G3 = nx.disjoint_union(G1, G2)

nx.draw(G3, with_labels=True, font_weight='bold')
plt.show()

4、有向图和无向图的转化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import networkx as nx
import matplotlib.pyplot as plt

# build an undirected path graph on 8 nodes
G = nx.path_graph(8)
# convert to a directed graph (each edge becomes two opposite arcs)
G2 = G.to_directed()

nx.draw(G2, with_labels=True, font_weight='bold')
plt.show()

# build a directed path graph
G3 = nx.path_graph(8, create_using=nx.DiGraph())
# convert to an undirected graph
G4 = G3.to_undirected()

nx.draw(G4, with_labels=True, font_weight='bold')
plt.show()

三、图的分析

1、度、度分布

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import networkx as nx

G = nx.random_graphs.barabasi_albert_graph(100, 3)

print(G.degree(0)) # degree of a single node
print(G.degree()) # degrees of all nodes

import matplotlib.pyplot as plt

degree = nx.degree_histogram(G) # degree distribution: node count for each degree value
x = range(len(degree))
y = [z / float(sum(degree)) for z in degree] # convert counts to frequencies

plt.plot(x, y, color="blue", linewidth=2)
plt.show()

2、图矩阵

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import networkx as nx
import matplotlib.pyplot as plt

# define the graph's nodes and weighted edges
nodes = ['0','1','2','3','4','5','a','b','c']
edges = [('0','0',1),('0','1',1),('0','5',1),('0','5',2),('1','2',3),('1','4',5),('2','1',7),('2','4',6),('a','b',0.5),('b','c',0.5),('c','a',0.5)]

# build one undirected and one directed graph from the same data
G1 = nx.Graph()
G1.add_nodes_from(nodes)
G1.add_weighted_edges_from(edges)
G2 = nx.DiGraph()
G2.add_nodes_from(nodes)
G2.add_weighted_edges_from(edges)

# adjacency matrix (sparse; .todense() for a dense view)
A = nx.adjacency_matrix(G1)
print('邻接矩阵:\n',A.todense())

# incidence matrix
I = nx.incidence_matrix(G1)
print('关联矩阵:\n',I.todense())

# Laplacian matrix (L = D - A)
L = nx.laplacian_matrix(G1)
print('拉普拉斯矩阵:\n',L.todense())

# normalized Laplacian matrix
NL = nx.normalized_laplacian_matrix(G1)
print('标准化的拉普拉斯矩阵:\n',NL.todense())

# directed-graph Laplacian (returned dense)
DL = nx.directed_laplacian_matrix(G2)
print('有向拉普拉斯矩阵:\n',DL)

# eigenvalues of the Laplacian
LS = nx.laplacian_spectrum(G1)
print('拉普拉斯算子的特征值:\n',LS)

# eigenvalues of the adjacency matrix
AS = nx.adjacency_spectrum(G1)
print('邻接矩阵的特征值:\n',AS)

# algebraic connectivity of the undirected graph
AC = nx.algebraic_connectivity(G1)
print('无向图的代数连通性:\n',AC)

# spectral ordering of the graph's nodes
SO = nx.spectral_ordering(G1)
print('图的光谱排序:\n',SO)

3、遍历

广度遍历:

1
2
3
4
5
6
7
8
import networkx as nx
import matplotlib.pyplot as plt

# build a path graph on 10 nodes
G = nx.path_graph(10)

# breadth-first traversal rooted at node 4
print(list(nx.bfs_tree(G, 4)))

深度遍历:

1
2
3
4
5
6
7
8
import networkx as nx
import matplotlib.pyplot as plt

# build a path graph on 10 nodes
G = nx.path_graph(10)

# depth-first traversal from node 5, limited to depth 3
print(list(nx.dfs_tree(G, source=5, depth_limit=3)))

4、连通分析

连通子图:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import networkx as nx
import matplotlib.pyplot as plt

nodes=['0','1','2','3','4','5','a','b','c']
edges=[('0','0',1),('0','1',1),('0','5',1),('0','5',2),('1','2',3),('1','4',5),('2','1',7),('2','4',6),('a','b',0.5),('b','c',0.5),('c','a',0.5)]

# build the undirected graph
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_weighted_edges_from(edges)

# list every connected component
print('connected_components of graph: ',list(nx.connected_components(G)))

# draw the graph
nx.draw(G, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()

弱连通子图:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import networkx as nx
import matplotlib.pyplot as plt

edges = [('0','0',1),('0','1',1),('0','5',1),('0','5',2),('1','2',3),('1','4',5),('2','1',7),('2','4',6),('a','b',0.5),('b','c',0.5),('c','a',0.5)]

# build the directed graph
G = nx.DiGraph()
G.add_weighted_edges_from(edges)

# print every weakly connected component
for c in nx.weakly_connected_components(G):
    print(c)

# component sizes, largest first
print([len(c) for c in sorted(nx.weakly_connected_components(G), key=len, reverse=True)])

nx.draw(G, with_labels=True, font_weight='bold')
plt.show()

强连通子图:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import networkx as nx
import matplotlib.pyplot as plt

edges = [('0','0',1),('0','1',1),('0','5',1),('0','5',2),('1','2',3),('1','4',5),('2','1',7),('2','4',6),('a','b',0.5),('b','c',0.5),('c','a',0.5)]

# build the directed graph
G = nx.DiGraph()
G.add_weighted_edges_from(edges)

# list every strongly connected component
con = nx.strongly_connected_components(G)
print(list(con))

nx.draw(G, with_labels=True, font_weight='bold')
plt.show()

5、最短路径

无权图:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import networkx as nx
import matplotlib.pyplot as plt

# build a path graph on 5 nodes, then attach extra edges 0-5 and 0-6
G = nx.path_graph(5)
nx.add_path(G,[0, 5])
nx.add_path(G,[0, 6])

path1 = nx.single_source_shortest_path(G, 0) # shortest paths from source 0 to every reachable node
length1 = nx.single_source_shortest_path_length(G, 0) # lengths of those shortest paths
print('当前源与所有可达顶点的最短路径: ', path1, '\n当前源与所有可达顶点的最短路径的长度:', length1)

path2 = dict(nx.all_pairs_shortest_path(G)) # shortest paths between all node pairs
length2 = dict(nx.all_pairs_shortest_path_length(G)) # lengths of all-pairs shortest paths
print('\ngraph两两顶点之间的最短路径: ',path2, '\ngraph两两顶点之间的最短路径的长度: ',length2)

path3 = nx.shortest_path(G, source=0, target=5) # shortest path for a given source and target
length3 = nx.shortest_path_length(G, source=0, target=5) # its length
print('顶点0到顶点5的最短路径:', path3, '\n最短路径的长度:', length3)

print('平均最短路径长度:', nx.average_shortest_path_length(G))

print('检测顶点0到顶点5是否有路径:',nx.has_path(G, 0, 5))

加权图(迪杰斯特拉算法):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import networkx as nx
import matplotlib.pyplot as plt

# build a directed path 0->1->...->5, plus shortcut edges 0->5 and 0->4
G = nx.path_graph(6, create_using=nx.DiGraph())
G.add_weighted_edges_from([(0, 5, 20.0)])
G.add_weighted_edges_from([(0, 4, 1.0)])

nx.draw(G, with_labels=True)
plt.show()

# single-source shortest weighted paths and lengths (Dijkstra)
length1, path1 = nx.single_source_dijkstra(G, 0)
print('单源顶点最短加权路径和长度: ', length1, path1)

# multi-source variant: shortest from the nearest of several sources
path2 = nx.multi_source_dijkstra_path(G, {0, 3})
length2 = nx.multi_source_dijkstra_path_length(G, {0, 3})
print('多源顶点最短加权路径和长度:', path2, length2)

# all-pairs shortest weighted paths and lengths
path3 = dict(nx.all_pairs_dijkstra_path(G))
length3 = dict(nx.all_pairs_dijkstra_path_length(G))
print('两两顶点之间最短加权路径和长度: ', path3, length3)

print('从源0到目标5的最短加权路径: ',nx.dijkstra_path(G, 0, 5))
print('从源0到目标5的最短加权路径的长度: ',nx.dijkstra_path_length(G, 0, 5))

加权图(贝尔曼-福特算法):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import networkx as nx
import matplotlib.pyplot as plt

# build a directed path 0->1->...->5, plus shortcut edges 0->5 and 0->4
G = nx.path_graph(6, create_using=nx.DiGraph())
G.add_weighted_edges_from([(0, 5, 20.0)])
G.add_weighted_edges_from([(0, 4, 1.0)])

nx.draw(G, with_labels=True)
plt.show()

# single-source shortest weighted paths and lengths (Bellman-Ford)
path1 = nx.single_source_bellman_ford_path(G, 0)
length1 = dict(nx.single_source_bellman_ford_path_length(G, 0))
# fix: the second print label used to repeat the first one verbatim
print('单源顶点最短加权路径:', path1, '单源顶点最短加权路径的长度:', length1)

# all-pairs shortest weighted paths and lengths
path2 = dict(nx.all_pairs_bellman_ford_path(G))
length2 = dict(nx.all_pairs_bellman_ford_path_length(G))
print('两两顶点之间最短加权路径和长度:', path2, length2)

print('G中从源到目标的最短加权路径:', nx.bellman_ford_path(G, 0, 5))
print('G中从源到目标的最短加权路径的长度:', nx.bellman_ford_path_length(G, 0, 5))

加权图(Astar算法):

1
2
3
4
5
6
7
8
9
10
11
12
13
import networkx as nx
import matplotlib.pyplot as plt

# build a directed path 0->1->...->5, plus shortcut edges 0->5 and 0->4
G = nx.path_graph(6, create_using=nx.DiGraph())
G.add_weighted_edges_from([(0, 5, 20.0)])
G.add_weighted_edges_from([(0, 4, 1.0)])

nx.draw(G, with_labels=True)
plt.show()

# A* shortest path and its length (no heuristic supplied)
print(nx.astar_path(G, 0, 5))
print(nx.astar_path_length(G, 0, 5))

6、最小/最大生成树

最小生成树:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import networkx as nx
import matplotlib.pyplot as plt

# build a small weighted undirected graph on 5 nodes
G = nx.Graph()
G.add_weighted_edges_from([('0','1',2),('0','2',7),('1','2',3),('1','3',8),('1','4',5),('2','3',1),('3','4',4)])

# minimum spanning tree; algorithm is one of kruskal / prim / boruvka (default kruskal)
KA = nx.minimum_spanning_tree(G,algorithm='kruskal')
print(KA.edges(data=True))

# or get just the MST edges directly; same algorithm choices
mst = nx.minimum_spanning_edges(G, algorithm='kruskal', data=False)
edgelist = list(mst)
print(edgelist)

最大生成树:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import networkx as nx
import matplotlib.pyplot as plt

# build a small weighted undirected graph on 5 nodes
G = nx.Graph()
G.add_weighted_edges_from([('0','1',2),('0','2',7),('1','2',3),('1','3',8),('1','4',5),('2','3',1),('3','4',4)])

# maximum spanning tree (or forest) of the undirected graph G
T = nx.maximum_spanning_tree(G)
print(sorted(T.edges(data=True)))

# or get just the edges; algorithm is one of kruskal / prim / boruvka (default kruskal)
mst = nx.tree.maximum_spanning_edges(G, algorithm='kruskal', data=False)
edgelist = list(mst)
print(edgelist)

7、群聚系数

1
2
3
4
5
6
import networkx as nx

G = nx.random_graphs.barabasi_albert_graph(1000, 3)

print(nx.average_clustering(G)) # average clustering coefficient
print(nx.clustering(G)) # clustering coefficient of every node

8、中心性度量

度中心性:
在社会网络中,对于具有更多连接关系的顶点,度中心性度量方法认为他们具有更高的中心性。可以利用顶点的度来衡量连接关系。

1
2
3
4
5
6
7
8
9
10
11
12
13
import networkx as nx
import numpy as np

def centralityDegree(G):
    """Degree centrality: each node's degree divided by (n - 1).

    Prints and returns the (node, centrality) pairs sorted by
    descending centrality.
    """
    s = 1.0 / (len(G) - 1.0)  # normalisation factor; requires len(G) >= 2
    centrality = {n: d * s for n, d in G.degree()}
    ranked = sorted(centrality.items(), key=lambda item: item[1], reverse=True)
    print(ranked)
    return ranked

# random 10x10 0/1 adjacency matrix -> directed graph
# (np.random.random_integers was removed from NumPy; randint(0, 2) draws
# the same inclusive 0/1 values)
a = np.random.randint(0, 2, size=(10, 10))
G = nx.DiGraph(a)
centralityDegree(G)

特征向量中心性:
在度中心性度量中,我们认为具有较多连接的顶点更重要。然而在现实中,拥有更多朋友并不能确保这个人就是重要的,拥有更多重要的朋友才能提供更有力的信息。
因此,我们试图用邻居顶点的重要性来概括本顶点的重要性。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import networkx as nx
import numpy as np
import scipy as sp
from scipy.sparse import linalg

def centralityEigenvector(G, max_iter=50, tol=0):
    """Eigenvector centrality from the dominant eigenvector of A^T.

    Returns a dict mapping each node to its (sign-normalised)
    component of the leading eigenvector.
    """
    # NOTE(review): nx.to_scipy_sparse_matrix was removed in networkx 3.0
    # (it became nx.to_scipy_sparse_array) — confirm the networkx version.
    M = nx.to_scipy_sparse_matrix(G, nodelist=list(G), weight=None, dtype=float)
    eigenvalue, eigenvector = linalg.eigs(M.T, k=1, which='LR', maxiter=max_iter, tol=tol)
    largest = eigenvector.flatten().real
    # fix: scipy removed its NumPy aliases (sp.sign / array norm via
    # sp.linalg); use numpy directly for sign and norm
    norm = np.sign(largest.sum()) * np.linalg.norm(largest)
    return dict(zip(G, largest / norm))

# random 10x10 0/1 adjacency matrix -> directed graph
a = np.random.randint(0, 2, size=(10, 10))  # random_integers was removed from NumPy
G = nx.DiGraph(a)
eigenvector = centralityEigenvector(G)
eigenvector = sorted(eigenvector.items(), key=lambda item: item[1], reverse=True)
print(eigenvector)

Katz中心性:
在特征向量中心性度量中,会有这么一个问题:某个点被大量的关注,但是关注该点的点的中心性很低(甚至为0),这样会导致网络中存在被大量关注但中心值却为0的点。因此,我们需要在特征向量中心性度量的基础上加入一个偏差项(或者说是中心值下限),来解决这个问题。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import networkx as nx
import numpy as np

def centralityKatz(G, alpha=0.1, beta=1.0):
    """Katz centrality: solve (I - alpha * A^T) c = b, then sign-normalise.

    beta may be a scalar (same bias for every node) or a dict keyed by node.
    Returns {node: centrality}.
    """
    try:
        # beta supplied per node as a dict
        nodelist = beta.keys()
        # fix: dtype is an np.array argument, not a list() argument
        # (the original `list(beta.values(), dtype=float)` raised TypeError)
        b = np.array(list(beta.values()), dtype=float)
    except AttributeError:
        # scalar beta: one column vector of the same bias
        nodelist = list(G)
        b = np.ones((len(nodelist), 1)) * float(beta)

    # NOTE(review): nx.adj_matrix is a deprecated alias of
    # nx.adjacency_matrix (removed in networkx 3.0) — confirm version.
    A = nx.adj_matrix(G, nodelist=nodelist, weight=None).todense().T
    n = A.shape[0]
    centrality = np.linalg.solve(np.eye(n, n) - (alpha * A), b)
    norm = np.sign(sum(centrality)) * np.linalg.norm(centrality)
    centrality = dict(zip(nodelist, map(float, centrality / norm)))
    return centrality

# random 10x10 0/1 adjacency matrix -> directed graph
a = np.random.randint(0, 2, size=(10, 10))  # random_integers was removed from NumPy
G = nx.DiGraph(a)
Katzeigenvector = centralityKatz(G, alpha=0.3, beta=0.3)
Katzeigenvector = sorted(Katzeigenvector.items(), key=lambda item: item[1], reverse=True)
print(Katzeigenvector)

PageRank:
Katz在某些情况下存在一些与特征向量中心性相似的问题。在有向图中,一旦一个顶点成为一个权威顶点(高中心值结点),它将向它所有的外连接传递其中心性,导致其它顶点中心性变的很高。但这是不可取的,因为不是每一个被名人所知的人都是有名的。因此我们在katz中心性的基础上,累加时让每一个邻居的中心性除以该邻居顶点的出度,这种度量称为PageRank。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import networkx as nx
import numpy as np

def Matrix(G, alpha=0.85, personalization=None, nodelist=None, weight='weight'):
    """Build the Google (PageRank transition) matrix P of G.

    P = alpha * M + (1 - alpha) * outer(e, v), where M is the
    row-normalised adjacency matrix (dangling rows replaced by uniform
    rows) and v is the personalisation vector (uniform by default).
    """
    if personalization is None:
        nodelist = G.nodes()
    else:
        nodelist = personalization.keys()
    # NOTE(review): nx.to_numpy_matrix was removed in networkx 3.0
    # (it became nx.to_numpy_array) — confirm the networkx version.
    M = nx.to_numpy_matrix(G, nodelist=nodelist, weight=weight)
    (n, m) = M.shape
    if n == 0:
        return M  # empty graph: nothing to normalise
    # dangling nodes (no out-links) get uniform transition probabilities
    dangling = np.where(M.sum(axis=1) == 0)
    for d in dangling[0]:
        M[d] = 1.0 / n
    M = M / M.sum(axis=1)  # row-normalise
    e = np.ones((n))
    if personalization is not None:
        v = np.array(list(personalization.values()), dtype=float)
    else:
        v = e
    v = v / v.sum()
    P = alpha * M + (1 - alpha) * np.outer(e, v)
    return P

def PageRank(G, alpha=0.85, personalization=None, weight='weight'):
    """PageRank via the dominant eigenvector of the Google matrix.

    Builds P with Matrix(), takes the eigenvector of P^T whose
    eigenvalue is largest, and normalises it to sum to 1.
    Returns {node: score}.
    """
    if personalization is None:
        nodelist = G.nodes()
    else:
        nodelist = personalization.keys()
    M = Matrix(G, alpha, personalization=personalization, nodelist=nodelist, weight=weight)

    eigenvalues, eigenvectors = np.linalg.eig(M.T)
    ind = eigenvalues.argsort()
    # eigenvector belonging to the largest eigenvalue (sorted last)
    largest = np.array(eigenvectors[:, ind[-1]]).flatten().real
    norm = float(largest.sum())
    centrality = dict(zip(nodelist, map(float, largest / norm)))
    return centrality

# random 10x10 0/1 adjacency matrix -> directed graph
a = np.random.randint(0, 2, size=(10, 10))  # random_integers was removed from NumPy
G = nx.DiGraph(a)
pagerank = PageRank(G, alpha=0.3)
pagerank = sorted(pagerank.items(), key=lambda item: item[1], reverse=True)
print(pagerank)

betweenness:
另一种中心性度量方法是考虑顶点在连接其他顶点时所表现出的重要性。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import networkx as nx
import random
import numpy as np

def centralityBetweenness(G, k=None, normalized=True):
    """Brandes betweenness centrality for an unweighted graph.

    Accumulates single-source shortest-path dependencies over every
    source and rescales.  k is accepted for API compatibility but is
    unused (no source sampling).  Returns {node: betweenness}.
    """
    betweenness = dict.fromkeys(G, 0.0)

    for s in G:
        S, P, sigma = _single_source_shortest_path(G, s)
        betweenness = _accumulate(betweenness, S, P, sigma, s)

    betweenness = _rescale(betweenness, len(G), normalized=normalized)
    return betweenness

def _single_source_shortest_path(G, s):
S = []
P = {}
for v in G:
P[v] = []
sigma = dict.fromkeys(G, 0.0) # sigma[v]=0 for v in G
D = {}
sigma[s] = 1.0
D[s] = 0
Q = [s]
while Q: # use BFS to find shortest paths
v = Q.pop(0)
S.append(v)
Dv = D[v]
sigmav = sigma[v]
for w in G[v]:
if w not in D:
Q.append(w)
D[w] = Dv + 1
if D[w] == Dv + 1: # this is a shortest path, count paths
sigma[w] += sigmav
P[w].append(v) # predecessors
return S, P, sigma

def _accumulate(betweenness, S, P, sigma, s):
delta = dict.fromkeys(S, 0)
while S:
w = S.pop()
coeff = (1.0 + delta[w]) / sigma[w]
for v in P[w]:
delta[v] += sigma[v] * coeff
if w != s:
betweenness[w] += delta[w]
return betweenness

def _rescale(betweenness, n, normalized, directed=False):
if normalized:
if n <= 2:
scale = None # no normalization b=0 for all nodes
else:
scale = 1.0 / ((n - 1) * (n - 2))
else:
scale = 0.5

if scale is not None:
for v in betweenness:
betweenness[v] *= scale
return betweenness

def topNBetweeness():
    """Demo: betweenness ranking of a random 10-node directed graph."""
    a = np.random.randint(0, 2, size=(10, 10))  # random_integers was removed from NumPy
    G = nx.DiGraph(a)
    score = centralityBetweenness(G)
    score = sorted(score.items(), key=lambda item: item[1], reverse=True)
    print("betweenness_centrality:", score)
    # node ids in ranking order
    output = [node for node, _ in score]
    print(output)

topNBetweeness()

closeness:
接近中心性的思想是顶点越趋于中心,它们越能快速地到达其他的顶点。更形式化的描述,这些顶点满足与其他顶点之间有最小的平均最短路径。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import networkx as nx
import random
import numpy as np

def closenessCentrality(G):
    """Closeness centrality, adjusted for disconnected graphs.

    For each node n: (reachable - 1) / sum of shortest-path lengths,
    scaled by the fraction of the graph n can reach.  Nodes that reach
    nothing get 0.  Returns {node: closeness}.
    """
    path_length = nx.single_source_shortest_path_length
    closeness_centrality = {}
    for n in G.nodes():
        sp = path_length(G, n)   # shortest-path lengths from n
        totsp = sum(sp.values())
        if totsp > 0.0 and len(G) > 1:
            closeness_centrality[n] = (len(sp) - 1.0) / totsp
            # normalize to number of nodes-1 in the connected part
            s = (len(sp) - 1.0) / (len(G) - 1)
            closeness_centrality[n] *= s
        else:
            closeness_centrality[n] = 0.0

    return closeness_centrality

def topNBetweeness():
    """Demo: closeness ranking of a random 10-node directed graph."""
    a = np.random.randint(0, 2, size=(10, 10))  # random_integers was removed from NumPy
    G = nx.DiGraph(a)
    score = closenessCentrality(G)
    score = sorted(score.items(), key=lambda item: item[1], reverse=True)
    print("closenessCentrality:", score)
    # node ids in ranking order
    output = [node for node, _ in score]
    print(output)

topNBetweeness()

四、网络图模型

1、随机图模型

随机网络(如ER模型),连接是随机设置的,但大部分顶点的连接数目会大致相同,即顶点的分布方式遵循钟形的泊松分布,有一个特征性的“平均数”。连接数目比平均数高许多或低许多的顶点都极少,随着连接数的增大,其概率呈指数式迅速递减。故随机网络亦称指数网络。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import networkx as nx
import itertools
import random
import matplotlib.pyplot as plt

def random_graph(n, p):
    """Erdos-Renyi G(n, p) random graph.

    Each of the C(n, 2) possible edges is included independently with
    probability p.  p < 0 yields an edgeless graph; p > 1 yields the
    complete graph.
    """
    G = nx.Graph()
    G.add_nodes_from(range(n))
    if p < 0:
        return G
    if p > 1:
        return nx.complete_graph(n, create_using=G)
    edges = itertools.combinations(range(n), 2)
    for e in edges:
        if random.random() < p:
            G.add_edge(*e)
    return G

# generate and draw a G(15, 0.2) random graph
ER = random_graph(15, 0.2)
pos = nx.shell_layout(ER)
nx.draw(ER, pos, with_labels=False, node_size=30)
plt.show()

2、小世界模型

在网络理论中,小世界网络是一类特殊的复杂网络结构,在这种网络中大部份的顶点彼此并不相连,但绝大部份顶点之间经过少数几步就可到达。
在日常生活中,有时你会发现,某些你觉得与你隔得很“遥远”的人,其实与你“很近”。小世界网络就是对这种现象(也称为小世界现象)的数学描述。用数学中图论的语言来说,小世界网络就是一个由大量顶点构成的图,其中任意两点之间的平均路径长度比顶点数量小得多。除了社会人际网络以外,小世界网络的例子在生物学、物理学、计算机科学等领域也有出现。
二十世纪60年代,美国哈佛大学社会心理学家斯坦利·米尔格伦(Stanley Milgram)做了一个连锁信实验。他将一些信件交给自愿的参加者,要求他们通过自己的熟人将信传到信封上指明的收信人手里,他发现,20%的信件最终送到了目标人物手中。而在成功传递的信件中,平均只需要6.5次转发,就能够到达目标。也就是说,在社会网络中,任意两个人之间的“距离”是6。这就是所谓的“六度分隔”理论。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import networkx as nx
import matplotlib.pyplot as plt
import random

def smallWorld_graph(n, k, p):
    """Watts-Strogatz small-world graph.

    n nodes on a ring, each joined to its k nearest neighbours (k/2 on
    each side); every lattice edge is then rewired with probability p,
    avoiding self-loops and duplicate edges.
    """
    G = nx.Graph()
    nodes = list(range(n))

    # ring lattice: connect each node to its j-th neighbour on the ring
    for j in range(1, k // 2 + 1):
        targets = nodes[j:] + nodes[0:j]  # ring rotated by j
        G.add_edges_from(zip(nodes, targets))

    # rewire each lattice edge (u, v) with probability p
    for j in range(1, k // 2 + 1):
        targets = nodes[j:] + nodes[0:j]
        for u, v in zip(nodes, targets):
            if random.random() < p:
                w = random.choice(nodes)
                # resample until w is neither u nor already adjacent
                while w == u or G.has_edge(u, w):
                    w = random.choice(nodes)
                    if G.degree(u) >= n - 1:
                        break  # u already connected to everyone: skip rewiring
                else:
                    G.remove_edge(u, v)
                    G.add_edge(u, w)
    return G

# generate a small-world network: 20 nodes, 4 nearest neighbours each,
# rewiring probability 0.2
WS = smallWorld_graph(20, 4, 0.2)
pos = nx.circular_layout(WS)
nx.draw(WS,pos,with_labels=False,node_size = 30)
plt.show()

3、优先链接模型

无标度网络模型(BA模型)具有两个特性,其一是增长性,所谓增长性是指网络规模是在不断的增大的,在研究的网络当中,网络的顶点是不断的增加的;其二就是优先连接机制,这个特性是指网络当中不断产生的新的顶点更倾向于和那些连接度较大的顶点相连接。BA模型对很多的现象都是可以解释的,例如研究生对导师的选择,在这个网络当中,研究生和导师都是不断增加的,而研究生总是倾向于选择已经带过很多研究生的导师。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import networkx as nx
import matplotlib.pyplot as plt
import itertools
import math
import random

def empty_graph(n):
    """Return an undirected graph with nodes 0..n-1 and no edges."""
    G = nx.Graph()
    G.add_nodes_from(range(n))
    return G

def random_subset(seq, m):
    """Return a set of m distinct elements drawn from seq.

    Elements are chosen uniformly from seq, so repeated entries in seq
    are proportionally more likely to be picked (this implements the
    preferential-attachment draw).  seq must contain at least m
    distinct values or the loop never terminates.
    """
    targets = set()
    while len(targets) < m:
        x = random.choice(seq)
        targets.add(x)
    return targets

def baModel(n, m):
    """Barabasi-Albert preferential-attachment graph.

    Starts from m isolated nodes and grows to n nodes; each new node
    attaches to m existing nodes chosen with probability proportional
    to their current degree.
    """
    G = empty_graph(m)
    targets = list(range(m))  # nodes the next new node will link to
    repeated_nodes = []       # node list with multiplicity = current degree
    source = m                # label of the next node to add
    while source < n:
        G.add_edges_from(zip([source] * m, targets))
        repeated_nodes.extend(targets)
        repeated_nodes.extend([source] * m)
        # uniform choice from repeated_nodes == degree-proportional sampling
        targets = random_subset(repeated_nodes, m)
        source += 1
    return G

# grow a 20-node BA network attaching 1 edge per new node, then draw it
BA = baModel(20, 1)
pos = nx.spring_layout(BA)
nx.draw(BA, pos, with_labels=False, node_size=30)
plt.show()
0%