Skip to content

Commit 487d970

Browse files
authored
Fixes #4138: Add support for loading Gephi GEXF file format (#4171)
* Fixes #4138: Add support for loading Gephi GEXF file format * removed unused imports * Fixed RollupTest
1 parent 0e6969a commit 487d970

File tree

12 files changed

+1455
-37
lines changed

12 files changed

+1455
-37
lines changed

docs/asciidoc/modules/ROOT/nav.adoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ include::partial$generated-documentation/nav.adoc[]
2626
** xref::import/load-csv.adoc[]
2727
** xref::import/xls.adoc[]
2828
** xref::import/html.adoc[]
29+
** xref::import/parquet.adoc[]
30+
** xref::import/gexf.adoc[]
2931
3032
* xref:export/index.adoc[]
3133
** xref::export/xls.adoc[]
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
[[gexf]]
2+
= Load GEXF (Graph Exchange XML Format)
3+
:description: This section describes procedures that can be used to import data from GEXF files.
4+
5+
6+
7+
Many existing applications and data integrations use GEXF to describe a graph with nodes and edges.
8+
For further information, you should visit the https://gexf.net/[official documentation].
9+
10+
It is possible to load or import nodes and relationships from a GEXF file with the procedures
11+
`apoc.load.gexf` and `apoc.import.gexf`. You need to:
12+
13+
* provide a path to a GEXF file
14+
* provide configuration (optional)
15+
16+
The `apoc.import.gexf` procedure reads the file in the same way as `apoc.load.gexf`, but also creates nodes and relationships in Neo4j.
17+
18+
For reading from files you'll have to enable the config option:
19+
20+
----
21+
apoc.import.file.enabled=true
22+
----
23+
24+
By default, file paths are global; for paths relative to the `import` directory, set:
25+
26+
----
27+
apoc.import.file.use_neo4j_config=true
28+
----
29+
30+
== Examples for apoc.load.gexf
31+
32+
.load.gexf
33+
----
34+
<?xml version="1.0" encoding="UTF-8"?>
35+
<gexf version="1.2">
36+
<graph defaultedgetype="directed">
37+
<nodes>
38+
<node foo="bar">
39+
<attvalues>
40+
<attvalue for="0" value="http://gephi.org"/>
41+
</attvalues>
42+
</node>
43+
</nodes>
44+
</graph>
45+
</gexf>
46+
----
47+
48+
[source, cypher]
49+
----
50+
CALL apoc.load.gexf('load.gexf')
51+
----
52+
53+
.Results
54+
[opts="header"]
55+
|===
56+
| value
57+
| {_type: gexf, _children: [{_type: graph, defaultedgetype: directed, _children: [{_type: nodes, _children: [{_type: node, _children: [{_type: attvalues, _children: [{_type: attvalue, for: 0, value: http://gephi.org}]}], foo: bar}]}]}], version: 1.2}
58+
|===
59+
60+
== Examples for apoc.import.gexf
61+
62+
Besides the file you can pass in a config map:
63+
64+
.Config parameters
65+
[opts=header]
66+
|===
67+
| name | type | default | description
68+
| readLabels | Boolean | false | Creates node labels based on the value in the `labels` property of `node` elements
69+
| defaultRelationshipType | String | RELATED | The default relationship type to use if none is specified in the GEXF file
70+
| storeNodeIds | Boolean | false | store the `id` property of `node` elements
71+
| batchSize | Integer | 20000 | The number of elements to process per transaction
72+
| compression | `Enum[NONE, BYTES, GZIP, BZIP2, DEFLATE, BLOCK_LZ4, FRAMED_SNAPPY]` | `null` | Allow taking binary data, either not compressed (value: `NONE`) or compressed (other values)
73+
| source | Map<String,String> | Empty map | See `source / target config` parameter below
74+
| target | Map<String,String> | Empty map | See `source / target config` parameter below
75+
See the xref::overview/apoc.load/apoc.load.csv.adoc#_binary_file[Binary file example]
76+
|===
77+
78+
79+
Importing the following file will create:
80+
81+
* 2 nodes with label Gephi
82+
* 2 nodes with label Webatlas
83+
* 1 node with label RTGI
84+
* 1 node with label BarabasiLab
85+
* 6 relationships of kind KNOWS
86+
* 1 relationship of kind HAS_TICKET
87+
* 1 relationship of kind BAZ
88+
89+
.data.gexf
90+
----
91+
<?xml version="1.0" encoding="UTF-8"?>
92+
<gexf xmlns="http://gexf.net/1.3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://gexf.net/1.3 http://gexf.net/1.3/gexf.xsd" version="1.2">
93+
<meta lastmodifieddate="2009-03-20">
94+
<creator>Gephi.org</creator>
95+
<description>A Web network</description>
96+
</meta>
97+
<graph defaultedgetype="directed">
98+
<attributes class="node">
99+
<attribute id="0" title="url" type="string"/>
100+
<attribute id="room" title="room" type="integer"/>
101+
<attribute id="projects" title="projects" type="long"/>
102+
<attribute id="price" title="price" type="double"/>
103+
<attribute id="1" title="indegree" type="float"/>
104+
<attribute id="members" title="members" type="liststring"/>
105+
<attribute id="pins" title="pins" type="listboolean"/>
106+
<attribute id="2" title="frog" type="boolean">
107+
<default>true</default>
108+
</attribute>
109+
</attributes>
110+
<attributes class="edge">
111+
<attribute id="score" title="score" type="float"/>
112+
</attributes>
113+
<nodes>
114+
<node id="0" label="Gephi">
115+
<attvalues>
116+
<attvalue for="0" value="http://gephi.org"/>
117+
<attvalue for="1" value="1"/>
118+
<attvalue for="room" value="10"/>
119+
<attvalue for="price" value="10.02"/>
120+
<attvalue for="projects" value="300"/>
121+
<attvalue for="members" value="[Altomare, Sterpeto, Lino]"/>
122+
<attvalue for="pins" value="[true, false, true, false]"/>
123+
</attvalues>
124+
</node>
125+
<node id="5" label="Gephi">
126+
<attvalues>
127+
<attvalue for="0" value="http://test.gephi.org"/>
128+
<attvalue for="1" value="2"/>
129+
</attvalues>
130+
</node>
131+
<node id="1" label="Webatlas">
132+
<attvalues>
133+
<attvalue for="0" value="http://webatlas.fr"/>
134+
<attvalue for="1" value="2"/>
135+
</attvalues>
136+
</node>
137+
<node id="2" label="RTGI">
138+
<attvalues>
139+
<attvalue for="0" value="http://rtgi.fr"/>
140+
<attvalue for="1" value="1"/>
141+
</attvalues>
142+
</node>
143+
<node id="3" label=":BarabasiLab:Webatlas">
144+
<attvalues>
145+
<attvalue for="0" value="http://barabasilab.com"/>
146+
<attvalue for="1" value="1"/>
147+
<attvalue for="2" value="false"/>
148+
</attvalues>
149+
</node>
150+
</nodes>
151+
<edges>
152+
<edge source="0" target="1" kind="KNOWS">
153+
<attvalues>
154+
<attvalue for="score" value="1.5"/>
155+
</attvalues>
156+
</edge>
157+
<edge source="0" target="0" kind="BAZ">
158+
<attvalues>
159+
<attvalue for="foo" value="bar"/>
160+
<attvalue for="score" value="2"/>
161+
</attvalues>
162+
</edge>
163+
<edge source="0" target="2" kind="HAS_TICKET">
164+
<attvalues>
165+
<attvalue for="ajeje" value="brazorf"/>
166+
<attvalue for="score" value="3"/>
167+
</attvalues>
168+
</edge>
169+
<edge source="0" target="2" kind="KNOWS" />
170+
<edge source="1" target="0" kind="KNOWS" />
171+
<edge source="2" target="1" kind="KNOWS" />
172+
<edge source="0" target="3" kind="KNOWS" />
173+
<edge source="5" target="3" kind="KNOWS" />
174+
</edges>
175+
</graph>
176+
</gexf>
177+
----
178+
179+
[source, cypher]
180+
----
181+
CALL apoc.import.gexf('data.gexf', {readLabels:true})
182+
----
183+
184+
.Results
185+
[opts="header"]
186+
|===
187+
| value
188+
| {
189+
"relationships" : 8,
190+
"batches" : 0,
191+
"file" : "file:/../data.gexf",
192+
"nodes" : 5,
193+
"format" : "gexf",
194+
"source" : "file",
195+
"time" : 9736,
196+
"rows" : 0,
197+
"batchSize" : -1,
198+
"done" : true,
199+
"properties" : 21
200+
}
201+
|===
202+
203+
We can also store the node IDs by executing:
204+
[source, cypher]
205+
----
206+
CALL apoc.import.gexf('data.gexf', {readLabels:true, storeNodeIds: true})
207+
----
208+
209+
=== source / target config
210+
211+
Allows the import of relations in case the source and / or target nodes are not present in the file, searching for nodes via a custom label and property.
212+
To do this, we can insert into the config map `source: {label: '<MY_SOURCE_LABEL>', id: '<MY_SOURCE_ID>'}` and/or `target: {label: '<MY_TARGET_LABEL>', id: '<MY_TARGET_ID>'}`.
213+
In this way, we can search for the start and end nodes via the `source` and `target` attributes of the `edge` tag.
214+
215+
For example, with a config map `{source: {id: 'myId', label: 'Foo'}, target: {id: 'other', label: 'Bar'}}`
216+
with an edge row like `<edge id="e0" source="n0" target="n1" label="KNOWS"><data key="label">KNOWS</data></edge>`
217+
we search a source node `(:Foo {myId: 'n0'})` and an end node `(:Bar {other: 'n1'})`.
218+
The id key is optional (the default is `'id'`).
219+
220+
221+
222+

docs/asciidoc/modules/ROOT/pages/import/index.adoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@ For more information on these procedures, see:
1313
* xref::import/xls.adoc[]
1414
* xref::import/html.adoc[]
1515
* xref::import/parquet.adoc[]
16+
* xref::import/gexf.adoc[]
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package apoc.load;
2+
3+
import apoc.Extended;
4+
import apoc.Pools;
5+
import apoc.export.util.CountingReader;
6+
import apoc.export.util.ExportConfig;
7+
import apoc.export.util.ProgressReporter;
8+
import apoc.load.util.XmlReadUtil.Import;
9+
import apoc.result.MapResult;
10+
import apoc.result.ProgressInfo;
11+
import apoc.util.FileUtils;
12+
import apoc.util.Util;
13+
import org.neo4j.graphdb.GraphDatabaseService;
14+
import org.neo4j.graphdb.security.URLAccessChecker;
15+
import org.neo4j.procedure.Context;
16+
import org.neo4j.procedure.Description;
17+
import org.neo4j.procedure.Mode;
18+
import org.neo4j.procedure.Name;
19+
import org.neo4j.procedure.Procedure;
20+
import org.neo4j.procedure.TerminationGuard;
21+
22+
import java.util.Map;
23+
import java.util.stream.Stream;
24+
25+
import static apoc.load.util.XmlReadUtil.Load.xmlXpathToMapResult;
26+
27+
@Extended
28+
public class Gexf {
29+
30+
@Context
31+
public GraphDatabaseService db;
32+
33+
@Context
34+
public URLAccessChecker urlAccessChecker;
35+
36+
@Context
37+
public TerminationGuard terminationGuard;
38+
39+
@Context
40+
public Pools pools;
41+
42+
@Procedure("apoc.load.gexf")
43+
@Description("apoc.load.gexf(urlOrBinary, path, $config) - load Gexf file from URL or binary source")
44+
public Stream<MapResult> gexf(
45+
@Name("urlOrBinary") Object urlOrBinary,
46+
@Name(value = "config", defaultValue = "{}") Map<String, Object> config
47+
) throws Exception {
48+
return xmlXpathToMapResult(urlOrBinary, urlAccessChecker, terminationGuard, config);
49+
}
50+
51+
@Procedure(name = "apoc.import.gexf", mode = Mode.WRITE)
52+
@Description("Imports a graph from the provided GraphML file.")
53+
public Stream<ProgressInfo> importGexf(
54+
@Name("urlOrBinaryFile") Object urlOrBinaryFile, @Name("config") Map<String, Object> config) {
55+
ProgressInfo result = Util.inThread(pools, () -> {
56+
ExportConfig exportConfig = new ExportConfig(config);
57+
String file = null;
58+
String source = "binary";
59+
if (urlOrBinaryFile instanceof String) {
60+
file = (String) urlOrBinaryFile;
61+
source = "file";
62+
}
63+
ProgressReporter reporter = new ProgressReporter(null, null, new ProgressInfo(file, source, "gexf"));
64+
Import graphReader = new Import(db)
65+
.reporter(reporter)
66+
.batchSize(exportConfig.getBatchSize())
67+
.relType(exportConfig.defaultRelationshipType())
68+
.source(exportConfig.getSource())
69+
.target(exportConfig.getTarget())
70+
.nodeLabels(exportConfig.readLabels());
71+
72+
if (exportConfig.storeNodeIds()) graphReader.storeNodeIds();
73+
74+
try (CountingReader reader =
75+
FileUtils.readerFor(urlOrBinaryFile, exportConfig.getCompressionAlgo(), urlAccessChecker)) {
76+
graphReader.parseXML(reader, terminationGuard);
77+
}
78+
79+
return reporter.getTotal();
80+
});
81+
return Stream.of(result);
82+
}
83+
}

0 commit comments

Comments
 (0)