Statistics
| Branch: | Revision:

iof-tools / networkxMiCe / networkx-master / examples / graph / plot_roget.py @ 5cef0f13

History | View | Annotate | Download (2.61 KB)

1
#!/usr/bin/env python
2
"""
3
=====
4
Roget
5
=====
6

7
Build a directed graph of 1022 categories and
8
5075 cross-references as defined in the 1879 version of Roget's Thesaurus
9
contained in the datafile roget_dat.txt. This example is described in
10
Section 1.2 in Knuth's book (see [1]_ and [2]_).
11

12
Note that one of the 5075 cross references is a self loop yet
13
it is included in the graph built here because
14
the standard networkx `DiGraph` class allows self loops.
15
(cf. 400pungency:400 401 403 405).
16

17
References
18
----------
19

20
.. [1] Donald E. Knuth,
21
   "The Stanford GraphBase: A Platform for Combinatorial Computing",
22
   ACM Press, New York, 1993.
23
.. [2] http://www-cs-faculty.stanford.edu/~knuth/sgb.html
24
"""
25

    
26
from __future__ import print_function
27

    
28
# Authors: Brendt Wohlberg, Aric Hagberg (hagberg@lanl.gov)
29
# Date: 2005-04-01 07:56:22 -0700 (Fri, 01 Apr 2005)
30

    
31
#    Copyright (C) 2004-2019 by
32
#    Aric Hagberg <hagberg@lanl.gov>
33
#    Dan Schult <dschult@colgate.edu>
34
#    Pieter Swart <swart@lanl.gov>
35
#    All rights reserved.
36
#    BSD license.
37

    
38
import gzip
39
import re
40
import sys
41

    
42
import matplotlib.pyplot as plt
43
from networkx import nx
44

    
45
def roget_graph():
    """ Return the thesaurus graph from the roget.dat example in
    the Stanford Graph Base.

    Reads the gzip-compressed file ``roget_dat.txt.gz`` from the current
    working directory.  Each logical record has the form
    ``<number><name>:<tail> <tail> ...``.  Lines starting with ``*`` are
    comments.  A line ending in a backslash is continued on the next
    line, which starts with a space.

    Returns
    -------
    G : nx.DiGraph
        A node for every record head (the leading number of the record,
        kept as a string) and an edge ``head -> tail`` for every
        cross-reference listed after the colon.  Self loops are reported
        on stderr but are still added to the graph.
    """
    # Compiled once, outside the loop.  The raw string avoids the invalid
    # "\d" escape sequence of a plain string literal.
    numfind = re.compile(r"^\d+")  # re to find the number of this word

    G = nx.DiGraph()

    # Text of a record that is continued on the following line.  Starting
    # from "" means a stray leading continuation line cannot raise a
    # NameError.
    oldline = ""

    # open file roget_dat.txt.gz (or roget_dat.txt); the context manager
    # closes the handle even if parsing raises.
    with gzip.open('roget_dat.txt.gz', 'r') as fh:
        for line in fh:
            line = line.decode()
            if line.startswith("*"):  # skip comments
                continue
            if line.startswith(" "):  # this is a continuation line, append
                line = oldline + line
                oldline = ""  # consumed; do not leak into a later record
            if line.endswith("\\\n"):  # continuation line, buffer, goto next
                oldline = line.strip("\\\n")
                continue

            (headname, tails) = line.split(":")

            # head
            head = numfind.findall(headname)[0]  # get the number

            G.add_node(head)

            for tail in tails.split():
                if head == tail:
                    # NOTE: despite the wording of this message the self
                    # loop is NOT skipped; add_edge below runs for every
                    # tail, so the loop ends up in the graph.
                    print("skipping self loop", head, tail, file=sys.stderr)
                G.add_edge(head, tail)

    return G
78

    
79

    
80
if __name__ == '__main__':
    # Build the directed thesaurus graph and report its size.
    G = roget_graph()
    print("Loaded roget_dat.txt containing 1022 categories.")
    print("digraph has %d nodes with %d edges"
          % (nx.number_of_nodes(G), nx.number_of_edges(G)))

    # The component count and the drawing both use the undirected version.
    UG = G.to_undirected()
    print(nx.number_connected_components(UG), "connected components")

    # Drawing options forwarded to nx.draw_circular.
    options = {
        'node_color': 'black',
        'node_size': 1,
        # 'edge_color' is the networkx keyword for edge colour; the former
        # 'line_color' key is not a drawing option, so the grey colour was
        # never applied.
        'edge_color': 'grey',
        'linewidths': 0,
        'width': 0.1,
    }
    nx.draw_circular(UG, **options)
    plt.show()