Skip to content

Commit 0ea872b

Browse files
More utilities for graph exploration:
- isArc(from, to)
- intersection and difference sets on sorted node lists
- intersection of successors/predecessors with a node list
- subgraph metrics for a given list of nodes defining the subgraph
1 parent adb9e3c commit 0ea872b

1 file changed

Lines changed: 112 additions & 0 deletions

File tree

  • src/main/java/org/commoncrawl/webgraph/explore

src/main/java/org/commoncrawl/webgraph/explore/Graph.java

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,22 @@ public long vertexLabelToId(String label) {
125125
}
126126
}
127127

128+
public boolean isArc(long fromId, long toId) {
129+
final LazyIntIterator succors = graph.successors((int) fromId);
130+
for (int s; (s = succors.nextInt()) != -1;) {
131+
if (s == toId) {
132+
return true;
133+
} else if (s > toId) {
134+
break;
135+
}
136+
}
137+
return false;
138+
}
139+
140+
public boolean isArc(String from, String to) {
141+
return isArc(vertexLabelToId(from), vertexLabelToId(to));
142+
}
143+
128144
public int outdegree(long vertexId) {
129145
return graph.outdegree((int) vertexId);
130146
}
@@ -185,6 +201,10 @@ public IntStream successorIntStream(ImmutableGraph graph, long vertexId, Interva
185201
return Arrays.stream(graph.successorArray((int) vertexId)).filter(x -> (interval.compareTo(x) == 0));
186202
}
187203

204+
public long[] successorIntersect(long vertexId, long[] vertexIds) {
205+
return intersect(vertexIds, successors(vertexId));
206+
}
207+
188208
public Stream<String> successorTopLevelDomainStream(ImmutableGraph graph, long vertexId) {
189209
return Arrays.stream(graph.successorArray((int) vertexId)).mapToObj(i -> getTopLevelDomain(vertexIdToLabel(i)));
190210
}
@@ -291,6 +311,10 @@ public IntStream predecessorIntStream(String vertexLabel, String prefix) {
291311
return successorIntStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix));
292312
}
293313

314+
public long[] predecessorIntersect(long vertexId, long[] vertexIds) {
315+
return intersect(vertexIds, predecessors(vertexId));
316+
}
317+
294318
public Stream<Entry<String, Long>> predecessorTopLevelDomainCounts(String vertexLabel) {
295319
return successorTopLevelDomainCounts(graphT, vertexLabelToId(vertexLabel));
296320
}
@@ -360,6 +384,67 @@ public static String getTopLevelDomain(String reversedDomainName) {
360384
return reversedDomainName;
361385
}
362386

387+
388+
/** Intersection of two sorted lists */
389+
public static long[] intersect(long[] a, long[] b) {
390+
int m = a.length;
391+
int n = b.length;
392+
LongArrayList res = new LongArrayList(Integer.min(m, n));
393+
int i = 0, j = 0;
394+
while (i < m && j < n) {
395+
if (a[i] < b[j]) {
396+
i++;
397+
} else if (a[i] > b[j]) {
398+
j++;
399+
} else {
400+
res.add(a[i]);
401+
i++;
402+
j++;
403+
}
404+
}
405+
return res.toArray(new long[0]);
406+
}
407+
408+
/** Intersection of two sorted lists */
409+
public static long[] intersect(long[] a, int[] b) {
410+
int m = a.length;
411+
int n = b.length;
412+
LongArrayList res = new LongArrayList(Integer.min(m, n));
413+
int i = 0, j = 0;
414+
while (i < m && j < n) {
415+
if (a[i] < b[j]) {
416+
i++;
417+
} else if (a[i] > b[j]) {
418+
j++;
419+
} else {
420+
res.add(a[i]);
421+
i++;
422+
j++;
423+
}
424+
}
425+
return res.toArray(new long[0]);
426+
}
427+
428+
/** Difference of two sorted lists: a \ b */
429+
public static long[] difference(long[] a, long[] b) {
430+
int m = a.length;
431+
int n = b.length;
432+
LongArrayList res = new LongArrayList(Integer.max(m, n));
433+
int i = 0, j = 0;
434+
while (i < m) {
435+
if (j >= n || a[i] < b[j]) {
436+
res.add(a[i]);
437+
i++;
438+
} else if (a[i] > b[j]) {
439+
j++;
440+
} else {
441+
i++;
442+
j++;
443+
}
444+
}
445+
return res.toArray(new long[0]);
446+
}
447+
363448
/**
364449
* Get the registered domain for a host name based on the ICANN section of the
365450
* <a href="https://www.publicsuffix.org/">public suffix list</a>.
@@ -414,4 +499,31 @@ public static String getRegisteredDomainReversed(String reversedHostName, boolea
414499
public static String reverseDomainName(String domainName) {
415500
return HostToDomainGraph.reverseHost(domainName);
416501
}
502+
503+
public void subgraphMetrics(long[] nodes) {
504+
long totalInlinks = 0, totalOutlinks = 0, arcsInCluster = 0, clusterInlinks = 0, clusterOutlinks = 0;
505+
for (long i : nodes) {
506+
int nInlinks = indegree(i);
507+
totalInlinks += nInlinks;
508+
int nOutlinks = outdegree(i);
509+
totalOutlinks += nOutlinks;
510+
int inClusterInlinks = predecessorIntersect(i, nodes).length;
511+
arcsInCluster += inClusterInlinks;
512+
clusterInlinks += nInlinks - inClusterInlinks;
513+
int inClusterOutlinks = successorIntersect(i, nodes).length;
514+
// Note: we do only count in-cluster inlinks (but not outlinks)
515+
// as in-cluster arcs. Otherwise we would count arcs twice.
516+
clusterOutlinks += nOutlinks - inClusterOutlinks;
517+
}
518+
LOG.info("Subgraph metrics:");
519+
LOG.info("\tnodes = {}", nodes.length);
520+
LOG.info("\tarcs = {} (counting only arcs connecting subgraph nodes)", arcsInCluster);
521+
LOG.info("\tavgdegree = {} (average degree in subgraph)", (double) arcsInCluster / nodes.length);
522+
LOG.info("\tinlinks = {} (links from the outer graph into the subgraph)", clusterInlinks);
523+
LOG.info("\toutlinks = {} (links from the subgraph to outer nodes)", clusterOutlinks);
524+
LOG.info("\ttotal inlinks = {} (all inlinks)", totalInlinks);
525+
LOG.info("\ttotal outlinks = {} (all outlinks)", totalOutlinks);
526+
LOG.info("\tnodes linked = {} (outer nodes linked from subgraph)", sharedSuccessors(nodes, 1, nodes.length).length);
527+
LOG.info("\tnodes linking = {} (outer nodes linking to subgraph)", sharedPredecessors(nodes, 1, nodes.length).length);
528+
}
417529
}

0 commit comments

Comments
 (0)