diff --git a/src/it/unimi/dsi/webgraph/Transform.java b/src/it/unimi/dsi/webgraph/Transform.java index 45cc8b5..f9bdbdc 100644 --- a/src/it/unimi/dsi/webgraph/Transform.java +++ b/src/it/unimi/dsi/webgraph/Transform.java @@ -17,12 +17,7 @@ package it.unimi.dsi.webgraph; -import java.io.BufferedOutputStream; -import java.io.DataInput; -import java.io.DataOutputStream; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; +import java.io.*; import java.lang.reflect.Field; import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; @@ -1199,7 +1194,7 @@ public int[] successorArray() { final int numPairs = this.numPairs; // Neither quicksort nor heaps are stable, so we reestablish order here. IntArrays.quickSort(successor, 0, numPairs); - if (numPairs!= 0) { + if (numPairs != 0) { int p = 0; for (int j = 1; j < numPairs; j++) if (successor[p] != successor[j]) successor[++p] = successor[j]; outdegree = p + 1; @@ -1271,6 +1266,290 @@ protected void finalize() throws Throwable { } + public static class ArcLabelledBatchGraph extends ArcLabelledImmutableSequentialGraph { + private final int n; + private final long numArcs; + private final ObjectArrayList batches; + private final ObjectArrayList labelBatches; + private final Label prototype; + private final LabelMergeStrategy labelMergeStrategy; + + public ArcLabelledBatchGraph(int n, long numArcs, ObjectArrayList batches, ObjectArrayList labelBatches, Label prototype, final LabelMergeStrategy labelMergeStrategy) { + this.n = n; + this.numArcs = numArcs; + this.batches = batches; + this.labelBatches = labelBatches; + this.prototype = prototype; + this.labelMergeStrategy = labelMergeStrategy; + } + + @Override + public int numNodes() { return n; } + @Override + public long numArcs() { return numArcs; } + @Override + public boolean hasCopiableIterators() { return true; } + + class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator { + /** The buffer size. 
We can't make it too big—there's two per batch, per thread. */ + private static final int STD_BUFFER_SIZE = 64 * 1024; + private final int[] refArray; + private final InputBitStream[] batchIbs; + private final InputBitStream[] labelInputBitStream; + private final int[] inputStreamLength; + private final int[] prevTarget; + + // The indirect queue used to merge the batches. + private final IntHeapSemiIndirectPriorityQueue queue; + /** The limit for {@link #hasNext()}. */ + private final int hasNextLimit; + + /** The last returned node (-1 if no node has been returned yet). */ + private int last; + /** The outdegree of the current node (valid if {@link #last} is not -1). */ + private int outdegree; + /** The number of pairs associated with the current node (valid if {@link #last} is not -1). */ + private int numPairs; + /** The successors of the current node (valid if {@link #last} is not -1); + * only the first {@link #outdegree} entries are meaningful. */ + private int[] successor; + /** The labels of the arcs going out of the current node (valid if {@link #last} is not -1); + * only the first {@link #outdegree} entries are meaningful. 
*/ + private Label[] label; + + public InternalArcLabelledNodeIterator(final int upperBound) throws IOException { + this(upperBound, null, null, null, null, null, -1, -1, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY); + } + + public InternalArcLabelledNodeIterator(final int upperBound, final InputBitStream[] baseIbs, final InputBitStream[] baseLabelInputBitStream, final int[] refArray, final int[] prevTarget, final int[] inputStreamLength, final int last, final int outdegree, final int successor[], final Label[] label) throws IOException { + this.hasNextLimit = Math.min(n, upperBound) - 1; + this.last = last; + this.outdegree = outdegree; + this.successor = successor; + this.label = label; + batchIbs = new InputBitStream[batches.size()]; + labelInputBitStream = new InputBitStream[batches.size()]; + + if (refArray == null) { + this.refArray = new int[batches.size()]; + this.prevTarget = new int[batches.size()]; + this.inputStreamLength = new int[batches.size()]; + Arrays.fill(this.prevTarget, -1); + queue = new IntHeapSemiIndirectPriorityQueue(this.refArray); + // We open all files and load the first element into the reference array. 
+ for(int i = 0; i < batches.size(); i++) { + batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); + labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); + this.inputStreamLength[i] = batchIbs[i].readDelta(); + this.refArray[i] = batchIbs[i].readDelta(); + queue.enqueue(i); + } + } + else { + this.refArray = refArray; + this.prevTarget = prevTarget; + this.inputStreamLength = inputStreamLength; + queue = new IntHeapSemiIndirectPriorityQueue(refArray); + + for(int i = 0; i < refArray.length; i++) { + if (baseIbs[i] != null) { + batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); + batchIbs[i].position(baseIbs[i].position()); + labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); + labelInputBitStream[i].position(baseLabelInputBitStream[i].position()); + queue.enqueue(i); + } + } + } + } + + @Override + public ArcLabelledNodeIterator copy(final int upperBound) { + try { + if (last == -1) return new InternalArcLabelledNodeIterator(upperBound); + else return new InternalArcLabelledNodeIterator(upperBound, batchIbs, labelInputBitStream, + refArray.clone(), prevTarget.clone(), inputStreamLength.clone(), last, outdegree(), Arrays.copyOf(successor, outdegree()), Arrays.copyOf(label, outdegree())); + } + catch (final IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public int outdegree() { + if (last == -1) throw new IllegalStateException(); + sortSuccessors(); + return outdegree; + } + + @Override + public boolean hasNext() { + return last < hasNextLimit; + } + + @Override + public int nextInt() { + if (! hasNext()) throw new NoSuchElementException(); + last++; + int d = 0; + outdegree = -1; + int i; + + try { + /* We extract elements from the queue as long as their target is equal + * to last. If during the process we exhaust a batch, we close it. */ + + while(! 
queue.isEmpty() && refArray[i = queue.first()] == last) { + successor = IntArrays.grow(successor, d + 1); + successor[d] = (prevTarget[i] += batchIbs[i].readDelta() + 1); + label = ObjectArrays.grow(label, d + 1); + label[d] = prototype.copy(); + label[d].fromBitStream(labelInputBitStream[i], last); + + if (--inputStreamLength[i] == 0) { + queue.dequeue(); + batchIbs[i].close(); + batchIbs[i] = null; + labelInputBitStream[i].close(); + labelInputBitStream[i] = null; + } + else { + // We read a new source and update the queue. + final int sourceDelta = batchIbs[i].readDelta(); + if (sourceDelta != 0) { + refArray[i] += sourceDelta; + prevTarget[i] = -1; + queue.changed(); + } + } + d++; + } + + numPairs = d; + } + catch(final IOException e) { + e.printStackTrace(); + throw new RuntimeException(this + " " + e); + } + + return last; + } + + @Override + public int[] successorArray() { + if (last == -1) throw new IllegalStateException(); + if (outdegree == -1) sortSuccessors(); + return successor; + } + + @Override + public Label[] labelArray() { + if (last == -1) throw new IllegalStateException(); + if (outdegree == -1) sortSuccessors(); + return super.labelArray(); + } + + @Override + public LabelledArcIterator successors() { + if (last == -1) throw new IllegalStateException(); + if (outdegree == -1) sortSuccessors(); + return new LabelledArcIterator() { + int last = -1; + + @Override + public Label label() { + return label[last]; + } + + @Override + public int nextInt() { + if (last + 1 == outdegree) return -1; + return successor[++last]; + } + + @Override + public int skip(final int k) { + final int toSkip = Math.min(k, outdegree - last - 1); + last += toSkip; + return toSkip; + } + }; + } + + @SuppressWarnings("deprecation") + @Override + protected void finalize() throws Throwable { + try { + for(final InputBitStream ibs: batchIbs) if (ibs != null) ibs.close(); + for(final InputBitStream ibs: labelInputBitStream) if (ibs != null) ibs.close(); + } + finally { + 
super.finalize(); + } + } + + private void sortSuccessors() { + // Compute outdegree + if (outdegree == -1) { + final int numPairs = this.numPairs; + // Neither quicksort nor heaps are stable, so we reestablish order here. + it.unimi.dsi.fastutil.Arrays.quickSort(0, numPairs, (x, y) -> Integer.compare(successor[x], successor[y]), + (x, y) -> { + final int t = successor[x]; + successor[x] = successor[y]; + successor[y] = t; + final Label l = label[x]; + label[x] = label[y]; + label[y] = l; + }); + + if (numPairs != 0) { + // Avoid returning the duplicate arcs + int p = 0; + for (int j = 1; j < numPairs; j++) { + if (successor[p] != successor[j]) { + successor[++p] = successor[j]; + } else if (labelMergeStrategy != null) { + label[p] = labelMergeStrategy.merge(label[p], label[j]); + } + } + outdegree = p + 1; + } + else outdegree = 0; + } + } + + } + + @Override + public ArcLabelledNodeIterator nodeIterator() { + try { + return new InternalArcLabelledNodeIterator(Integer.MAX_VALUE); + } + catch (final IOException e) { + throw new RuntimeException(e); + } + } + + @SuppressWarnings("deprecation") + @Override + protected void finalize() throws Throwable { + try { + for(final File f : batches) f.delete(); + for(final File f : labelBatches) f.delete(); + } + finally { + super.finalize(); + } + } + + @Override + public Label prototype() { + return prototype; + } + } + + /** Sorts the given source and target arrays w.r.t. the target and stores them in a temporary file. * * @param n the index of the last element to be sorted (exclusive). @@ -1319,79 +1598,114 @@ else if (target[i] != target[i - 1]) { return u; } - /** Sorts the given source and target arrays w.r.t. the target and stores them in two temporary files. - * An additional positionable input bit stream is provided that contains labels, starting at given positions. - * Labels are also written onto the appropriate file. + /** + * Sorts the given source and target arrays w.r.t. 
the target and stores them in two temporary files. An additional + * positionable input bit stream is provided that contains labels, starting at given positions. Labels are also + * written onto the appropriate file. * * @param n the index of the last element to be sorted (exclusive). * @param source the source array. * @param target the target array. - * @param start the array containing the bit position (within the given input stream) where the label of the arc starts. + * @param start the array containing the bit position (within the given input stream) where the label of the arc + * starts. * @param labelBitStream the positionable bit stream containing the labels. * @param tempDir a temporary directory where to store the sorted arrays. * @param batches a list of files to which the batch file will be added. * @param labelBatches a list of files to which the label batch file will be added. + * @param labelMergeStrategy the strategy used to merge the labels of duplicate arcs; if {@code null}, the label read last (in sorted order) replaces the previous one. + * @return the number of pairs in the batch (might be less than n because duplicates are eliminated). 
*/ - private static void processTransposeBatch(final int n, final int[] source, final int[] target, final long[] start, - final InputBitStream labelBitStream, final File tempDir, final List batches, final List labelBatches, - final Label prototype) throws IOException { + public static int processTransposeBatch(final int n, final int[] source, final int[] target, final long[] start, + final InputBitStream labelBitStream, final File tempDir, final List batches, final List labelBatches, + Label prototype, final LabelMergeStrategy labelMergeStrategy) throws IOException { it.unimi.dsi.fastutil.Arrays.parallelQuickSort(0, n, (x,y) -> { - final int t = Integer.compare(source[x], source[y]); - if (t != 0) return t; - return Integer.compare(target[x], target[y]); - }, - (x, y) -> { - int t = source[x]; - source[x] = source[y]; - source[y] = t; - t = target[x]; - target[x] = target[y]; - target[y] = t; - final long u = start[x]; - start[x] = start[y]; - start[y] = u; - }); + final int t = Integer.compare(source[x], source[y]); + if (t != 0) return t; + return Integer.compare(target[x], target[y]); + }, + (x, y) -> { + int t = source[x]; + source[x] = source[y]; + source[y] = t; + t = target[x]; + target[x] = target[y]; + target[y] = t; + final long u = start[x]; + start[x] = start[y]; + start[y] = u; + }); final File batchFile = File.createTempFile("batch", ".bitstream", tempDir); batchFile.deleteOnExit(); batches.add(batchFile); final OutputBitStream batch = new OutputBitStream(batchFile); + final File labelFile = File.createTempFile("label-", ".bits", tempDir); + labelFile.deleteOnExit(); + labelBatches.add(labelFile); + final OutputBitStream labelObs = new OutputBitStream(labelFile); + + // Used to handle duplicate arcs with different labels + final Label otherPrototype = prototype.copy(); + + int u = 0; + if (n != 0) { // Compute unique pairs - batch.writeDelta(n); + u = 1; + for(int i = n - 1; i-- != 0;) if (source[i] != source[i + 1] || target[i] != target[i + 1]) u++; 
+ batch.writeDelta(u); + int prevSource = source[0]; batch.writeDelta(prevSource); batch.writeDelta(target[0]); + labelBitStream.position(start[0]); + prototype.fromBitStream(labelBitStream, source[0]); + for(int i = 1; i < n; i++) { if (source[i] != prevSource) { batch.writeDelta(source[i] - prevSource); batch.writeDelta(target[i]); prevSource = source[i]; + + prototype.toBitStream(labelObs, target[i - 1]); + labelBitStream.position(start[i]); + prototype.fromBitStream(labelBitStream, source[i]); } else if (target[i] != target[i - 1]) { // We don't write duplicate pairs batch.writeDelta(0); batch.writeDelta(target[i] - target[i - 1] - 1); + + prototype.toBitStream(labelObs, target[i - 1]); + labelBitStream.position(start[i]); + prototype.fromBitStream(labelBitStream, source[i]); + } + else { + // Duplicate arcs, overwrite the label with either the new label encountered or merging the two labels. + labelBitStream.position(start[i]); + + if (labelMergeStrategy != null) { + otherPrototype.fromBitStream(labelBitStream, source[i]); + prototype = labelMergeStrategy.merge(otherPrototype, prototype); + } + else { + prototype.fromBitStream(labelBitStream, source[i]); + } } } + + prototype.toBitStream(labelObs, target[n - 1]); } + else batch.writeDelta(0); batch.close(); - - final File labelFile = File.createTempFile("label-", ".bits", tempDir); - labelFile.deleteOnExit(); - labelBatches.add(labelFile); - final OutputBitStream labelObs = new OutputBitStream(labelFile); - for (int i = 0; i < n; i++) { - labelBitStream.position(start[i]); - prototype.fromBitStream(labelBitStream, source[i]); - prototype.toBitStream(labelObs, target[i]); - } labelObs.close(); + + return u; } /** Returns an immutable graph obtained by reversing all arcs in g, using an offline method. 
@@ -1701,7 +2015,7 @@ public static ArcLabelledImmutableGraph transposeOffline(final ArcLabelledImmuta if (j == batchSize) { obs.flush(); - processTransposeBatch(batchSize, source, target, start, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype); + processTransposeBatch(batchSize, source, target, start, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype, null); fbos = new FastByteArrayOutputStream(); obs = new OutputBitStream(fbos); //ALERT here we should re-use j = 0; @@ -1714,7 +2028,7 @@ public static ArcLabelledImmutableGraph transposeOffline(final ArcLabelledImmuta if (j != 0) { obs.flush(); - processTransposeBatch(j, source, target, start, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype); + processTransposeBatch(j, source, target, start, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype, null); } if (pl != null) { @@ -1725,239 +2039,10 @@ public static ArcLabelledImmutableGraph transposeOffline(final ArcLabelledImmuta final long numArcs = m; // Now we return an immutable graph whose nodeIterator() merges the batches on the fly. - return new ArcLabelledImmutableSequentialGraph() { - @Override - public int numNodes() { return n; } - @Override - public long numArcs() { return numArcs; } - @Override - public boolean hasCopiableIterators() { return true; } - - class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator { - /** The buffer size. We can't make it too big—there's two per batch, per thread. */ - private static final int STD_BUFFER_SIZE = 64 * 1024; - private final int[] refArray; - private final InputBitStream[] batchIbs; - private final InputBitStream[] labelInputBitStream; - private final int[] inputStreamLength; - private final int[] prevTarget; - - // The indirect queue used to merge the batches. - private final IntHeapSemiIndirectPriorityQueue queue; - /** The limit for {@link #hasNext()}. 
*/ - private final int hasNextLimit; - - /** The last returned node (-1 if no node has been returned yet). */ - private int last; - /** The outdegree of the current node (valid if {@link #last} is not -1). */ - private int outdegree; - /** The successors of the current node (valid if {@link #last} is not -1); - * only the first {@link #outdegree} entries are meaningful. */ - private int[] successor; - /** The labels of the arcs going out of the current node (valid if {@link #last} is not -1); - * only the first {@link #outdegree} entries are meaningful. */ - private Label[] label; - - public InternalArcLabelledNodeIterator(final int upperBound) throws IOException { - this(upperBound, null, null, null, null, null, -1, 0, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY); - } - - public InternalArcLabelledNodeIterator(final int upperBound, final InputBitStream[] baseIbs, final InputBitStream[] baseLabelInputBitStream, final int[] refArray, final int[] prevTarget, final int[] inputStreamLength, final int last, final int outdegree, final int successor[], final Label[] label) throws IOException { - this.hasNextLimit = Math.min(n, upperBound) - 1; - this.last = last; - this.outdegree = outdegree; - this.successor = successor; - this.label = label; - batchIbs = new InputBitStream[batches.size()]; - labelInputBitStream = new InputBitStream[batches.size()]; - - if (refArray == null) { - this.refArray = new int[batches.size()]; - this.prevTarget = new int[batches.size()]; - this.inputStreamLength = new int[batches.size()]; - Arrays.fill(this.prevTarget, -1); - queue = new IntHeapSemiIndirectPriorityQueue(this.refArray); - // We open all files and load the first element into the reference array. 
- for(int i = 0; i < batches.size(); i++) { - batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); - labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); - this.inputStreamLength[i] = batchIbs[i].readDelta(); - this.refArray[i] = batchIbs[i].readDelta(); - queue.enqueue(i); - } - } - else { - this.refArray = refArray; - this.prevTarget = prevTarget; - this.inputStreamLength = inputStreamLength; - queue = new IntHeapSemiIndirectPriorityQueue(refArray); - - for(int i = 0; i < refArray.length; i++) { - if (baseIbs[i] != null) { - batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); - batchIbs[i].position(baseIbs[i].position()); - labelInputBitStream[i] = new InputBitStream(labelBatches.get(i), STD_BUFFER_SIZE); - labelInputBitStream[i].position(baseLabelInputBitStream[i].position()); - queue.enqueue(i); - } - } - } - } - - @Override - public int outdegree() { - if (last == -1) throw new IllegalStateException(); - return outdegree; - } - - @Override - public boolean hasNext() { - return last < hasNextLimit; - } - - @Override - public int nextInt() { - last++; - int d = 0; - int i; - - try { - /* We extract elements from the queue as long as their target is equal - * to last. If during the process we exhaust a batch, we close it. */ - - while(! queue.isEmpty() && refArray[i = queue.first()] == last) { - successor = IntArrays.grow(successor, d + 1); - successor[d] = (prevTarget[i] += batchIbs[i].readDelta() + 1); - label = ObjectArrays.grow(label, d + 1); - label[d] = prototype.copy(); - label[d].fromBitStream(labelInputBitStream[i], last); - - if (--inputStreamLength[i] == 0) { - queue.dequeue(); - batchIbs[i].close(); - labelInputBitStream[i].close(); - batchIbs[i] = null; - labelInputBitStream[i] = null; - } - else { - // We read a new source and update the queue. 
- final int sourceDelta = batchIbs[i].readDelta(); - if (sourceDelta != 0) { - refArray[i] += sourceDelta; - prevTarget[i] = -1; - queue.changed(); - } - } - d++; - } - // Neither quicksort nor heaps are stable, so we reestablish order here. - it.unimi.dsi.fastutil.Arrays.quickSort(0, d, (x, y) -> Integer.compare(successor[x], successor[y]), - (x, y) -> { - final int t = successor[x]; - successor[x] = successor[y]; - successor[y] = t; - final Label l = label[x]; - label[x] = label[y]; - label[y] = l; - }); - } - catch(final IOException e) { - throw new RuntimeException(e); - } - - outdegree = d; - return last; - } - - @Override - public int[] successorArray() { - if (last == -1) throw new IllegalStateException(); - return successor; - } - - @SuppressWarnings("deprecation") - @Override - protected void finalize() throws Throwable { - try { - for(final InputBitStream ibs: batchIbs) if (ibs != null) ibs.close(); - for(final InputBitStream ibs: labelInputBitStream) if (ibs != null) ibs.close(); - } - finally { - super.finalize(); - } - } - - @Override - public LabelledArcIterator successors() { - if (last == -1) throw new IllegalStateException(); - return new LabelledArcIterator() { - int last = -1; - - @Override - public Label label() { - return label[last]; - } - - @Override - public int nextInt() { - if (last + 1 == outdegree) return -1; - return successor[++last]; - } - - @Override - public int skip(final int k) { - final int toSkip = Math.min(k, outdegree - last - 1); - last += toSkip; - return toSkip; - } - }; - } - - - @Override - public ArcLabelledNodeIterator copy(final int upperBound) { - try { - if (last == -1) return new InternalArcLabelledNodeIterator(upperBound); - else return new InternalArcLabelledNodeIterator(upperBound, batchIbs, labelInputBitStream, - refArray.clone(), prevTarget.clone(), inputStreamLength.clone(), last, outdegree, Arrays.copyOf(successor, outdegree), Arrays.copyOf(label, outdegree)); - } - catch (final IOException e) { - throw new 
RuntimeException(e); - } - } - } - - - @Override - public ArcLabelledNodeIterator nodeIterator() { - try { - return new InternalArcLabelledNodeIterator(Integer.MAX_VALUE); - } - catch (final IOException e) { - throw new RuntimeException(e); - } - } - - @SuppressWarnings("deprecation") - @Override - protected void finalize() throws Throwable { - try { - for(final File f : batches) f.delete(); - for(final File f : labelBatches) f.delete(); - } - finally { - super.finalize(); - } - } - @Override - public Label prototype() { - return prototype; - } - - }; + // We don't need a merge strategy because a transposition never introduces duplicates + return new ArcLabelledBatchGraph(n, numArcs, batches, labelBatches, prototype, null); } - /** Returns an immutable graph obtained by reversing all arcs in g. * *

This method can process {@linkplain ImmutableGraph#loadOffline(CharSequence) offline graphs}. @@ -2602,8 +2687,8 @@ public static void main(final String args[]) throws IOException, IllegalArgument "transposeOffline sourceBasename destBasename [batchSize] [tempDir]\n" + "symmetrize sourceBasename [transposeBasename] destBasename\n" + "symmetrizeOffline sourceBasename destBasename [batchSize] [tempDir]\n" + - "simplifyOffline sourceBasename destBasename [batchSize] [tempDir]\n" + - "simplify sourceBasename transposeBasename destBasename\n" + + "simplifyOffline sourceBasename destBasename [batchSize] [tempDir]\n" + + "simplify sourceBasename transposeBasename destBasename\n" + "union source1Basename source2Basename destBasename [strategy]\n" + "compose source1Basename source2Basename destBasename [semiring]\n" + "gray sourceBasename destBasename\n" + @@ -2630,8 +2715,8 @@ public static void main(final String args[]) throws IOException, IllegalArgument new Switch("ascii", 'a', "ascii", "Maps are in ASCII form (one integer per line)."), new UnflaggedOption("transform", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The transformation to be applied."), new UnflaggedOption("param", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.GREEDY, "The remaining parameters."), - } - ); + } + ); final JSAPResult jsapResult = jsap.parse(args); if (jsap.messagePrinted()) System.exit(1); diff --git a/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java new file mode 100644 index 0000000..80a1a29 --- /dev/null +++ b/src/it/unimi/dsi/webgraph/labelling/ScatteredLabelledArcsASCIIGraph.java @@ -0,0 +1,941 @@ +/* + * Copyright (C) 2011-2023 Sebastiano Vigna + * + * This program and the accompanying materials are made available under the + * terms of the GNU Lesser General Public License v2.1 or later, + * which is available at + * 
http://www.gnu.org/licenses/old-licenses/lgpl-2.1-standalone.html, + * or the Apache Software License 2.0, which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. + * + * SPDX-License-Identifier: LGPL-2.1-or-later OR Apache-2.0 + */ + +package it.unimi.dsi.webgraph.labelling; + +import com.martiansoftware.jsap.*; +import it.unimi.dsi.Util; +import it.unimi.dsi.fastutil.bytes.ByteArrays; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.io.FastBufferedInputStream; +import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream; +import it.unimi.dsi.fastutil.longs.Long2IntFunction; +import it.unimi.dsi.fastutil.objects.Object2IntFunction; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; +import it.unimi.dsi.lang.MutableString; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.sux4j.mph.GOV3Function; +import it.unimi.dsi.webgraph.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Iterator; +import java.util.concurrent.TimeUnit; +import java.util.zip.GZIPInputStream; + +import static it.unimi.dsi.webgraph.Transform.processTransposeBatch; +import static it.unimi.dsi.webgraph.labelling.ArcLabelledImmutableGraph.UNDERLYINGGRAPH_SUFFIX; + +/** + * An {@link ArcLabelledImmutableGraph} that corresponds to a labelled graph stored as a scattered list of arcs. + * + *

+ * A scattered list of arcs describes a graph in a fairly loose way. Each line contains a + * labelled arc specified as two node identifiers and a label separated by whitespace (but we suggest exactly one TAB + * character). + * + *

+ * In the standard description, node identifiers can be in the range + * [-2<sup>63</sup>..2<sup>63</sup>): they will be remapped in a compact identifier space by + * assigning to each newly seen identifier a new node number. The list of identifiers in order of + * appearance is available in {@link #ids}. Lines can be empty, or comments starting with + * #. Characters following the label will be discarded with a warning. + * Similarly, the labels can be in the range [-2<sup>63</sup>..2<sup>63</sup>) and will be saved + * as-is in gamma coding; in case of duplicate arcs only the last label will be considered, but + * this behaviour can be changed by providing more parameters. + * + *

+ * Warning: Lines not conforming to the above specification will cause an error to be + * logged, but will be otherwise ignored. + * + *

+ * Alternatively, you can + * {@linkplain #ScatteredLabelledArcsASCIIGraph(InputStream, Object2LongFunction, Charset, int, boolean) + * provide} an {@link Object2LongFunction Object2LongFunction&lt;String&gt;} with default return value + * -1 that will be used to map identifiers to node numbers, along with a {@link Charset} to parse + * lines and the number of nodes of the graph (which must be a strict upper bound for the largest + * value returned by the function). Note that in principle an {@link Object2IntFunction} would be + * sufficient, but we want to make it easier to use functions from Sux4J such as {@link GOV3Function}. + * + *

+ * Additionally, the resulting graph can be symmetrized, and its loops be removed, using + * {@linkplain #ScatteredLabelledArcsASCIIGraph(InputStream, boolean, boolean, int, File, ProgressLogger) + * suitable constructor options}. + * + *

+ * Using {@linkplain #ScatteredLabelledArcsASCIIGraph(InputStream, Label, LabelMapping, LabelMergeStrategy) + * suitable constructor options} you can provide a {@link Label} as prototype, a {@link LabelMapping} to + * convert the textual labels into objects of the prototype's type, and a {@link LabelMergeStrategy} + * to handle the case of identical arcs with different labels. + * + *

+ * This class has no load method, and its main method converts a scattered-arcs representation + * directly into a {@link BVGraph}. + * + *

Using {@link ScatteredLabelledArcsASCIIGraph} to convert your data

+ * + *

+ * A simple (albeit rather inefficient) way to import data into WebGraph is using ASCII graphs + * specified by scattered arcs. Suppose you create the following file, named + * example.arcs: + * + *

+ *  # My graph
+ *  -1 15 100
+ *  15 2 200
+ *  2 -1 300 This will cause a warning to be logged
+ *  OOPS! (This will cause an error to be logged)
+ *  -1 2 400
+ * 
+ * + * Then, the command + * + *
+ *  java it.unimi.dsi.webgraph.labelling.ScatteredLabelledArcsASCIIGraph example &lt; example.arcs
+ * 
+ * + * will produce a compressed labelled graph in {@link it.unimi.dsi.webgraph.BVGraph} format. + * The underlying graph will be saved with basename example-underlying. + * The file example.ids will contain the list of longs -1, 15, 2. + * The node with identifier -1 will be the node 0 in the output graph, the node with identifier + * 15 will be node 1, and the node with identifier 2 will be node 2. The graph example + * will thus have three nodes and four arcs (viz., &lt;0,1&gt;, &lt;0,2&gt;, &lt;1,2&gt; and + * &lt;2,0&gt;). The labels will be saved as example.labels in the order of visit + * of the arcs; the file example.labeloffsets records the offset of each label, + * because in general labels are not written in a fixed number of bits. + * + *

Memory requirements

+ * + *

+ * To convert node identifiers to node numbers, instances of this class use a custom map that in the + * worst case will require + * 19.5×2⌈log(4n/3)⌉ ≤ 52n bytes, + * where n is the number of distinct identifiers. Storing batches of arcs in memory + * requires 8 bytes per arc. + */ + +public class ScatteredLabelledArcsASCIIGraph extends ImmutableSequentialGraph { + /** + * The default batch size. + */ + public static final int DEFAULT_BATCH_SIZE = 1000000; + /** + * The default label prototype. + */ + public static final Label DEFAULT_LABEL_PROTOTYPE = new GammaCodedIntLabel("FOO"); + /** + * The default label mapping function. + */ + public static final LabelMapping DEFAULT_LABEL_MAPPING = (label, st) -> ((GammaCodedIntLabel) label).value = Integer.parseInt((String) st); + + private static final Logger LOGGER = LoggerFactory.getLogger(ScatteredLabelledArcsASCIIGraph.class); + private final static boolean DEBUG = false; + + /** + * The extension of the identifier file (a binary list of longs). + */ + private static final String IDS_EXTENSION = ".ids"; + /** + * The labelled batch graph used to return node iterators. + */ + private final Transform.ArcLabelledBatchGraph arcLabelledBatchGraph; + /** + * The list of identifiers in order of appearance. + */ + public long[] ids; + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is) throws IOException { + this(is, DEFAULT_LABEL_PROTOTYPE, DEFAULT_LABEL_MAPPING, null, false); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param is an input stream containing a standard scattered list of arcs. + * @param labelPrototype an example of the labels contained in the graph. + * @param labelMapping a function mapping string into the label defined by the prototype. 
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping) throws IOException {
+        this(is, labelPrototype, labelMapping, null, false); // No merge strategy, no symmetrization.
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph with a label merge strategy and no symmetrization.
+     *
+     * @param is an input stream containing a standard scattered list of arcs.
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping a string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy) throws IOException {
+        this(is, labelPrototype, labelMapping, labelMergeStrategy, false);
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph, optionally symmetrized; loops are kept.
+     *
+     * @param is an input stream containing a standard scattered list of arcs.
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping a string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize) throws IOException {
+        this(is, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, false);
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph using batches of {@link #DEFAULT_BATCH_SIZE} arcs.
+     *
+     * @param is an input stream containing a standard scattered list of arcs.
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping a string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @param noLoops the new graph will have no loops.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops) throws IOException {
+        this(is, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, DEFAULT_BATCH_SIZE);
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph with a given batch size, storing the batches in
+     * the default temporary directory.
+     *
+     * @param is an input stream containing a standard scattered list of arcs.
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping a string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @param noLoops the new graph will have no loops.
+     * @param batchSize the number of arcs in a batch; two arrays of integers and one array of longs of
+     * this size will be allocated by this method.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize) throws IOException {
+        this(is, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, null);
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph with a given batch size and temporary directory,
+     * without progress logging.
+     *
+     * @param is an input stream containing a standard scattered list of arcs.
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping a string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @param noLoops the new graph will have no loops.
+     * @param batchSize the number of arcs in a batch; two arrays of integers and one array of longs of
+     * this size will be allocated by this method.
+     * @param tempDir a temporary directory for the batches, or {@code null} for
+     * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir) throws IOException {
+        this(is, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, tempDir, null);
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph using {@link #DEFAULT_LABEL_PROTOTYPE},
+     * {@link #DEFAULT_LABEL_MAPPING} and no label merge strategy.
+     *
+     * @param is an input stream containing a standard scattered list of arcs.
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @param noLoops the new graph will have no loops.
+     * @param batchSize the number of arcs in a batch; two arrays of integers and one array of longs of
+     * this size will be allocated by this method.
+     * @param tempDir a temporary directory for the batches, or {@code null} for
+     * {@link File#createTempFile(String, String)}'s choice.
+     * @param pl a progress logger, or {@code null}.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException {
+        this(is, null, null, -1, DEFAULT_LABEL_PROTOTYPE, DEFAULT_LABEL_MAPPING, null, symmetrize, noLoops, batchSize, tempDir, pl); // No function: identifiers are numbered in order of appearance.
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph in which node identifiers are longs that are
+     * numbered in order of appearance (no explicit function is used).
+     *
+     * @param is an input stream containing a standard scattered list of arcs.
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping a string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @param noLoops the new graph will have no loops.
+     * @param batchSize the number of arcs in a batch; two arrays of integers and one array of longs of
+     * this size will be allocated by this method.
+     * @param tempDir a temporary directory for the batches, or {@code null} for
+     * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice.
+     * @param pl a progress logger, or {@code null}.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException {
+        this(is, null, null, -1, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, tempDir, pl); // null function and charset, n = -1 (unused without a function).
+    }
+
+    /**
+     * Creates a scattered-arcs ASCII graph.
+     *
+     * @param is an input stream containing a scattered list of arcs.
+     * @param function an explicitly provided function from string representing nodes to node numbers, or
+     * null for the standard behaviour.
+     * @param charset a character set that will be used to read the identifiers passed to function, or
+     * null for ISO-8859-1 (used only if function is not null).
+     * @param n the number of nodes of the graph (used only if function is not null).
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy) throws IOException {
+        this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, false); // No symmetrization.
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph using {@link #DEFAULT_LABEL_PROTOTYPE},
+     * {@link #DEFAULT_LABEL_MAPPING} and no label merge strategy; loops are kept.
+     *
+     * @param is an input stream containing a scattered list of arcs.
+     * @param function an explicitly provided function from string representing nodes to node numbers, or
+     * {@code null} for the standard behaviour.
+     * @param charset a character set that will be used to read the identifiers passed to {@code function}, or
+     * {@code null} for ISO-8859-1 (used only if {@code function} is not {@code null}).
+     * @param n the number of nodes of the graph (used only if {@code function} is not {@code null}).
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final boolean symmetrize) throws IOException {
+        this(is, function, charset, n, DEFAULT_LABEL_PROTOTYPE, DEFAULT_LABEL_MAPPING, null, symmetrize, false);
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph, optionally symmetrized; loops are kept.
+     *
+     * @param is an input stream containing a scattered list of arcs.
+     * @param function an explicitly provided function from string representing nodes to node numbers, or
+     * {@code null} for the standard behaviour.
+     * @param charset a character set that will be used to read the identifiers passed to {@code function}, or
+     * {@code null} for ISO-8859-1 (used only if {@code function} is not {@code null}).
+     * @param n the number of nodes of the graph (used only if {@code function} is not {@code null}).
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping a string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize) throws IOException {
+        this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, false);
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph using batches of {@link #DEFAULT_BATCH_SIZE} arcs.
+     *
+     * @param is an input stream containing a scattered list of arcs.
+     * @param function an explicitly provided function from string representing nodes to node numbers, or
+     * {@code null} for the standard behaviour.
+     * @param charset a character set that will be used to read the identifiers passed to {@code function}, or
+     * {@code null} for ISO-8859-1 (used only if {@code function} is not {@code null}).
+     * @param n the number of nodes of the graph (used only if {@code function} is not {@code null}).
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping a string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @param noLoops the new graph will have no loops.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops) throws IOException {
+        this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, DEFAULT_BATCH_SIZE);
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph with a given batch size, storing the batches in
+     * the default temporary directory.
+     *
+     * @param is an input stream containing a scattered list of arcs.
+     * @param function an explicitly provided function from string representing nodes to node numbers, or
+     * {@code null} for the standard behaviour.
+     * @param charset a character set that will be used to read the identifiers passed to {@code function}, or
+     * {@code null} for ISO-8859-1 (used only if {@code function} is not {@code null}).
+     * @param n the number of nodes of the graph (used only if {@code function} is not {@code null}).
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping a string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @param noLoops the new graph will have no loops.
+     * @param batchSize the number of arcs in a batch; two arrays of integers and one array of longs of
+     * this size will be allocated by this method.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize) throws IOException {
+        this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, null);
+    }
+
+    /**
+     * Creates a labelled scattered-arcs ASCII graph with a given batch size and temporary directory,
+     * without progress logging.
+     *
+     * @param is an input stream containing a scattered list of arcs.
+     * @param function an explicitly provided function from string representing nodes to node numbers, or
+     * {@code null} for the standard behaviour.
+     * @param charset a character set that will be used to read the identifiers passed to {@code function}, or
+     * {@code null} for ISO-8859-1 (used only if {@code function} is not {@code null}).
+     * @param n the number of nodes of the graph (used only if {@code function} is not {@code null}).
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping a string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @param symmetrize the new graph will be forced to be symmetric.
+     * @param noLoops the new graph will have no loops.
+     * @param batchSize the number of arcs in a batch; two arrays of integers and one array of longs of
+     * this size will be allocated by this method.
+     * @param tempDir a temporary directory for the batches, or {@code null} for
+     * {@link File#createTempFile(String, String)}'s choice.
+     * @throws IOException if an I/O error occurs.
+     */
+    public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, final Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir) throws IOException {
+        this(is, function, charset, n, labelPrototype, labelMapping, labelMergeStrategy, symmetrize, noLoops, batchSize, tempDir, null);
+    }
+
+    /**
+     * Creates a scattered-arcs ASCII graph.
+     *
+     * @param is an input stream containing a scattered list of arcs.
+     * @param function an explicitly provided function from string representing nodes to node numbers, or
+     * null for the standard behaviour.
+     * @param charset a character set that will be used to read the identifiers passed to function, or
+     * null for ISO-8859-1 (used only if function is not null).
+     * @param n the number of nodes of the graph (used only if function is not null).
+     * @param labelPrototype an example of the labels contained in the graph.
+     * @param labelMapping a function mapping string into the label defined by the prototype.
+     * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels.
+     * @param symmetrize the new graph will be forced to be symmetric.
+ * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(String, String)}'s choice. + * @param pl a progress logger, or null. + */ + public ScatteredLabelledArcsASCIIGraph(final InputStream is, final Object2LongFunction function, Charset charset, final int n, final Label labelPrototype, final LabelMapping labelMapping, final LabelMergeStrategy labelMergeStrategy, final boolean symmetrize, final boolean noLoops, final int batchSize, final File tempDir, final ProgressLogger pl) throws IOException { + @SuppressWarnings("resource") + final FastBufferedInputStream fbis = new FastBufferedInputStream(is); + ScatteredArcsASCIIGraph.Id2NodeMap map = new ScatteredArcsASCIIGraph.Id2NodeMap(); + + int numNodes = -1; + if (charset == null) charset = StandardCharsets.ISO_8859_1; + + int j; + int[] source = new int[batchSize], target = new int[batchSize]; + long[] labelStart = new long[batchSize]; + FastByteArrayOutputStream fbos = new FastByteArrayOutputStream(); + OutputBitStream obs = new OutputBitStream(fbos); + final ObjectArrayList batches = new ObjectArrayList<>(), labelBatches = new ObjectArrayList<>(); + final Label prototype = labelPrototype.copy(); + + if (pl != null) { + pl.itemsName = "labelled arcs"; + pl.start("Creating sorted batches..."); + } + + j = 0; + long pairs = 0; // Number of pairs + byte[] array = new byte[1024]; + for (long line = 1; ; line++) { + int start = 0, len; + while ((len = fbis.readLine(array, start, array.length - start, FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) { + start += len; + array = ByteArrays.grow(array, array.length + 1); + } + + if (len == -1) break; // EOF + + final int lineLength = start + len; + + if (DEBUG) + System.err.println("Reading line " + line + "... 
(" + new String(array, 0, lineLength, charset) + ")"); + + // Skip whitespace at the start of the line. + int offset = 0; + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset == lineLength) { + if (DEBUG) System.err.println("Skipping line " + line + "..."); + continue; // Whitespace line + } + + if (array[0] == '#') continue; + + // Scan source id. + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; + + int s; + + if (function == null) { + final long sl; + try { + sl = getLong(array, start, offset - start); + } catch (final RuntimeException e) { + // Discard up to the end of line + LOGGER.error("Error at line " + line + ": " + e.getMessage()); + continue; + } + + s = map.getNode(sl); + + if (DEBUG) System.err.println("Parsed source at line " + line + ": " + sl + " => " + s); + } else { + final String ss = new String(array, start, offset - start, charset); + final long sl = function.getLong(ss); + if (sl == -1) { + LOGGER.warn("Unknown source identifier " + ss + " at line " + line); + continue; + } + if (sl < 0 || sl >= n) + throw new IllegalArgumentException("Source node number out of range for node " + ss + ": " + sl); + s = (int)sl; + if (DEBUG) System.err.println("Parsed target at line " + line + ": " + ss + " => " + s); + } + + // Skip whitespace between identifiers. + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset == lineLength) { + LOGGER.error("Error at line " + line + ": no target"); + continue; + } + + // Scan target id. 
+ start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; + + int t; + + if (function == null) { + final long tl; + try { + tl = getLong(array, start, offset - start); + } catch (final RuntimeException e) { + // Discard up to the end of line + LOGGER.error("Error at line " + line + ": " + e.getMessage()); + continue; + } + + t = map.getNode(tl); + + if (DEBUG) System.err.println("Parsed target at line " + line + ": " + tl + " => " + t); + } else { + final String ts = new String(array, start, offset - start, charset); + final long tl = function.getLong(ts); + if (tl == -1) { + LOGGER.warn("Unknown target identifier " + ts + " at line " + line); + continue; + } + + if (tl < 0 || tl >= n) + throw new IllegalArgumentException("Target node number out of range for node " + ts + ": " + tl); + t = (int)tl; + if (DEBUG) System.err.println("Parsed target at line " + line + ": " + ts + " => " + t); + } + + // Skip whitespace between identifiers. + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset == lineLength) { + LOGGER.error("Error at line " + line + ": no label"); + continue; + } + + // Scan label. + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) offset++; + + final String ls = new String(array, start, offset - start, charset); + + // Insert current value into the prototype label. + labelMapping.apply(prototype, ls); + if (DEBUG) System.err.println("Parsed label at line " + line + ": " + ls + " => " + prototype.get()); + + // Skip whitespace after label. 
+ while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') offset++; + + if (offset < lineLength) LOGGER.warn("Trailing characters ignored at line " + line); + + if (DEBUG) + System.err.println("Parsed labelled arc at line " + line + ": " + s + " -> " + t + " (" + prototype.get() + ")"); + + if (s != t || !noLoops) { + source[j] = s; + target[j] = t; + labelStart[j] = obs.writtenBits(); + prototype.toBitStream(obs, s); + j++; + + if (j == batchSize) { + obs.flush(); + pairs += processTransposeBatch(batchSize, source, target, labelStart, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype, labelMergeStrategy); + fbos = new FastByteArrayOutputStream(); + obs = new OutputBitStream(fbos); + j = 0; + } + + if (symmetrize && s != t) { + source[j] = t; + target[j] = s; + labelStart[j] = obs.writtenBits(); + prototype.toBitStream(obs, t); + j++; + + if (j == batchSize) { + obs.flush(); + pairs += processTransposeBatch(batchSize, source, target, labelStart, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype, labelMergeStrategy); + fbos = new FastByteArrayOutputStream(); + obs = new OutputBitStream(fbos); + j = 0; + } + } + + if (pl != null) pl.lightUpdate(); + } + } + + if (j != 0) { + obs.flush(); + pairs += processTransposeBatch(j, source, target, labelStart, new InputBitStream(fbos.array), tempDir, batches, labelBatches, prototype, labelMergeStrategy); + } + + if (pl != null) { + pl.done(); + logBatches(batches, pairs, pl); + } + + numNodes = function == null ? (int)map.size() : function.size(); + source = null; + target = null; + labelStart = null; + + if (function == null) { + ids = map.getIds(tempDir); + } + + this.arcLabelledBatchGraph = new Transform.ArcLabelledBatchGraph(function == null ? numNodes : n, pairs, batches, labelBatches, prototype, labelMergeStrategy); + } + + /** + * Creates a scattered-arcs ASCII graph. + * + * @param arcs an iterator returning the arcs as two-element arrays. 
+ * @param function a function to map the long ids passed in arcs to int nodes. + * @param n the number of nodes of the graph (used only if function is not null). + * @param arcLabels a homogeneous iterator returning the labels in the same order as the arcs. + * @param labelMergeStrategy a merge strategy to apply when encountering duplicate arcs with different labels. + * @param symmetrize the new graph will be forced to be symmetric. + * @param noLoops the new graph will have no loops. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be allocated by + * this method. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(String, String)}'s choice. + * @param pl a progress logger, or null. + */ + public ScatteredLabelledArcsASCIIGraph(final Iterator arcs, final Long2IntFunction function, final int n, final Iterator