Skip to content

Commit f44043b

Browse files
committed
Node merge
1 parent ac6cd8e commit f44043b

9 files changed

Lines changed: 335 additions & 41 deletions

File tree

pipeline/ingestion/src/main/java/org/datacommons/ingestion/data/CacheReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ public NodesEdges parseArcRow(String row, Counter mcfNodesWithoutTypeCounter) {
141141
Node.builder()
142142
.subjectId(nodeId)
143143
.value(nodeValue)
144-
.bytes(bytes)
144+
.bytes(bytes != null ? bytes.toByteArray() : new byte[0])
145145
.name(entity.getName())
146146
.types(types)
147147
.build());

pipeline/ingestion/src/main/java/org/datacommons/ingestion/data/Edge.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ public class Edge implements Serializable {
1414
private String objectId;
1515
private String provenance;
1616

17+
@SuppressWarnings("unused")
18+
private Edge() {}
19+
1720
// Private constructor to enforce use of Builder
1821
private Edge(Builder builder) {
1922
this.subjectId = builder.subjectId;

pipeline/ingestion/src/main/java/org/datacommons/ingestion/data/GraphReader.java

Lines changed: 158 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package org.datacommons.ingestion.data;
22

3-
import com.google.cloud.ByteArray;
43
import com.google.cloud.spanner.Mutation;
54
import java.io.IOException;
65
import java.io.Serializable;
@@ -9,8 +8,10 @@
98
import java.util.Collections;
109
import java.util.List;
1110
import java.util.Map;
11+
import java.util.Set;
1212
import org.apache.beam.sdk.Pipeline;
1313
import org.apache.beam.sdk.metrics.Counter;
14+
import org.apache.beam.sdk.transforms.Combine;
1415
import org.apache.beam.sdk.transforms.Create;
1516
import org.apache.beam.sdk.transforms.DoFn;
1617
import org.apache.beam.sdk.transforms.Flatten;
@@ -40,6 +41,118 @@ public class GraphReader implements Serializable {
4041
private static final String DATCOM_AGGREGATE = "DataCommonsAggregate";
4142
private static final String IMPORT_METADATA_FILE = "import_metadata_mcf.mcf";
4243

44+
public static PCollection<Node> combineNodes(PCollection<Node> nodes) {
45+
return nodes
46+
.apply(
47+
"MapNodesToKV",
48+
ParDo.of(
49+
new DoFn<Node, KV<String, Node>>() {
50+
@ProcessElement
51+
public void processElement(
52+
@Element Node node, OutputReceiver<KV<String, Node>> receiver) {
53+
receiver.output(KV.of(node.getSubjectId(), node));
54+
}
55+
}))
56+
.apply(
57+
"CombineNodes",
58+
Combine.perKey(
59+
new Combine.CombineFn<Node, List<Node>, Node>() {
60+
@Override
61+
public List<Node> createAccumulator() {
62+
return new ArrayList<>();
63+
}
64+
65+
@Override
66+
public List<Node> addInput(List<Node> accumulator, Node input) {
67+
accumulator.add(input);
68+
return accumulator;
69+
}
70+
71+
@Override
72+
public List<Node> mergeAccumulators(Iterable<List<Node>> accumulators) {
73+
List<Node> merged = new ArrayList<>();
74+
for (List<Node> acc : accumulators) {
75+
merged.addAll(acc);
76+
}
77+
return merged;
78+
}
79+
80+
@Override
81+
public Node extractOutput(List<Node> accumulator) {
82+
if (accumulator.isEmpty()) return null;
83+
Node first = accumulator.get(0);
84+
Node.Builder builder =
85+
Node.builder()
86+
.subjectId(first.getSubjectId())
87+
.value(first.getValue())
88+
.name(first.getName())
89+
.types(first.getTypes())
90+
.bytes(first.getBytes());
91+
92+
Set<String> types = new java.util.TreeSet<>();
93+
for (Node n : accumulator) {
94+
types.addAll(n.getTypes());
95+
if (!n.getValue().isEmpty()) {
96+
builder.value(n.getValue());
97+
}
98+
if (!n.getName().isEmpty()) {
99+
builder.name(n.getName());
100+
}
101+
if (n.getBytes().length > 0) {
102+
builder.bytes(n.getBytes());
103+
}
104+
}
105+
if (types.size() > 1 && types.contains("ProvisionalNode")) {
106+
types.remove("ProvisionalNode");
107+
}
108+
builder.types(new ArrayList<>(types));
109+
return builder.build();
110+
}
111+
}))
112+
.apply(
113+
"ExtractNodes",
114+
ParDo.of(
115+
new DoFn<KV<String, Node>, Node>() {
116+
@ProcessElement
117+
public void processElement(
118+
@Element KV<String, Node> element, OutputReceiver<Node> receiver) {
119+
receiver.output(element.getValue());
120+
}
121+
}));
122+
}
123+
124+
public static PCollection<Mutation> nodeToMutations(
125+
PCollection<Node> nodes, SpannerClient spannerClient) {
126+
return nodes.apply(
127+
"NodesToMutations",
128+
ParDo.of(
129+
new DoFn<Node, Mutation>() {
130+
@ProcessElement
131+
public void processElement(@Element Node node, OutputReceiver<Mutation> receiver) {
132+
Mutation mutation = spannerClient.toNodeMutation(node);
133+
if (mutation != null) {
134+
receiver.output(mutation);
135+
}
136+
}
137+
}));
138+
}
139+
140+
public static PCollection<Mutation> edgeToMutations(
141+
PCollection<Edge> edges, SpannerClient spannerClient) {
142+
return edges.apply(
143+
"EdgesToMutations",
144+
ParDo.of(
145+
new DoFn<Edge, Mutation>() {
146+
@ProcessElement
147+
public void processElement(@Element Edge edge, OutputReceiver<Mutation> receiver) {
148+
Mutation mutation = spannerClient.toEdgeMutation(edge);
149+
if (mutation != null) {
150+
receiver.output(mutation);
151+
}
152+
}
153+
}));
154+
}
155+
43156
public static List<Node> graphToNodes(McfGraph graph, Counter mcfNodesWithoutTypeCounter) {
44157
List<Node> nodes = new ArrayList<>();
45158
for (Map.Entry<String, PropertyValues> nodeEntry : graph.getNodesMap().entrySet()) {
@@ -79,10 +192,11 @@ public static List<Node> graphToNodes(McfGraph graph, Counter mcfNodesWithoutTyp
79192
node = Node.builder();
80193
node.subjectId(PipelineUtils.generateObjectValueKey(val.getValue()));
81194
if (PipelineUtils.storeValueAsBytes(entry.getKey())) {
82-
node.bytes(ByteArray.copyFrom(PipelineUtils.compressString(val.getValue())));
195+
node.bytes(PipelineUtils.compressString(val.getValue()));
83196
} else {
84197
node.value(val.getValue());
85198
}
199+
node.types(List.of(ValueType.TEXT.toString()));
86200
nodes.add(node.build());
87201
}
88202
}
@@ -101,13 +215,13 @@ public static PCollection<McfGraph> getProvenanceMcf(
101215
String defaultProvenance =
102216
"Node: dcid:dc/base/" + importName + "\n" + "typeOf: dcid:Provenance\n";
103217
mcfList.add(GraphUtils.convertToGraph(defaultProvenance));
218+
// try {
219+
// mcfList.add(GraphUtils.convertToGraph(PipelineUtils.getGCSFileContent(metadataFile)));
220+
// } catch (IOException e) {
221+
// LOGGER.warn("Failed to read provenance metadata file: " + e.getMessage());
222+
// }
104223
try {
105-
mcfList.addAll(GraphUtils.readMcfString(PipelineUtils.getGcsFileContent(metadataFile)));
106-
} catch (IOException e) {
107-
LOGGER.warn("Failed to read provenance metadata file: " + e.getMessage());
108-
}
109-
try {
110-
mcfList.addAll(GraphUtils.readMcfString(PipelineUtils.getGcsFileContent(provenanceFile)));
224+
mcfList.add(GraphUtils.convertToGraph(PipelineUtils.getGCSFileContent(provenanceFile)));
111225
} catch (IOException e) {
112226
LOGGER.warn("Failed to read provenance metadata file: " + e.getMessage());
113227
}
@@ -214,6 +328,42 @@ public void processElement(
214328
}));
215329
}
216330

331+
public static PCollection<Node> mcfToNodes(
332+
PCollection<McfGraph> graph, Counter nodeCounter, Counter mcfNodesWithoutTypeCounter) {
333+
return graph.apply(
334+
"McfToNodes",
335+
ParDo.of(
336+
new DoFn<McfGraph, Node>() {
337+
@ProcessElement
338+
public void processElement(@Element McfGraph element, OutputReceiver<Node> receiver) {
339+
List<Node> nodes = graphToNodes(element, mcfNodesWithoutTypeCounter);
340+
for (Node node : nodes) {
341+
// LOGGER.info("Node: {}", node.toString());
342+
receiver.output(node);
343+
}
344+
nodeCounter.inc(nodes.size());
345+
}
346+
}));
347+
}
348+
349+
public static PCollection<Edge> mcfToEdges(
350+
PCollection<McfGraph> graph, String provenance, Counter edgeCounter) {
351+
return graph.apply(
352+
"McfToEdges",
353+
ParDo.of(
354+
new DoFn<McfGraph, Edge>() {
355+
@ProcessElement
356+
public void processElement(@Element McfGraph element, OutputReceiver<Edge> receiver) {
357+
List<Edge> edges = graphToEdges(element, provenance);
358+
for (Edge edge : edges) {
359+
receiver.output(edge);
360+
// LOGGER.info("Edge : {}", edge.toString());
361+
}
362+
edgeCounter.inc(edges.size());
363+
}
364+
}));
365+
}
366+
217367
public static PCollection<KV<String, Mutation>> graphToNodes(
218368
PCollection<McfGraph> graph,
219369
SpannerClient spannerClient,

pipeline/ingestion/src/main/java/org/datacommons/ingestion/data/Node.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package org.datacommons.ingestion.data;
22

3-
import com.google.cloud.ByteArray;
43
import java.io.Serializable;
4+
import java.util.Arrays;
55
import java.util.List;
66
import java.util.Objects;
77
import org.apache.beam.sdk.coders.DefaultCoder;
@@ -16,10 +16,13 @@ public class Node implements Serializable {
1616

1717
private String subjectId;
1818
private String value;
19-
private ByteArray bytes;
19+
private byte[] bytes;
2020
private String name;
2121
private List<String> types;
2222

23+
@SuppressWarnings("unused")
24+
private Node() {}
25+
2326
// Private constructor to enforce use of Builder
2427
private Node(Builder builder) {
2528
this.subjectId = builder.subjectId;
@@ -41,7 +44,7 @@ public String getValue() {
4144
return value;
4245
}
4346

44-
public ByteArray getBytes() {
47+
public byte[] getBytes() {
4548
return bytes;
4649
}
4750

@@ -60,7 +63,7 @@ public boolean equals(Object o) {
6063
Node node = (Node) o;
6164
return Objects.equals(subjectId, node.subjectId)
6265
&& Objects.equals(value, node.value)
63-
&& Objects.equals(bytes, node.bytes)
66+
&& Arrays.equals(bytes, node.bytes)
6467
&& Objects.equals(name, node.name)
6568
&& Objects.equals(types, node.types);
6669
}
@@ -74,13 +77,13 @@ public int hashCode() {
7477
public String toString() {
7578
return String.format(
7679
"Node{subjectId='%s', value='%s', bytes='%s', name='%s', types=%s}",
77-
subjectId, value, bytes, name, types);
80+
subjectId, value, Arrays.toString(bytes), name, types);
7881
}
7982

8083
public static class Builder {
8184
private String subjectId = "";
8285
private String value = "";
83-
private ByteArray bytes = null;
86+
private byte[] bytes = new byte[0];
8487
private String name = "";
8588
private List<String> types = List.of();
8689

@@ -96,7 +99,7 @@ public Builder value(String value) {
9699
return this;
97100
}
98101

99-
public Builder bytes(ByteArray bytes) {
102+
public Builder bytes(byte[] bytes) {
100103
this.bytes = bytes;
101104
return this;
102105
}

pipeline/ingestion/src/main/java/org/datacommons/ingestion/pipeline/ImportGroupPipeline.java

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,16 @@
1111
import org.apache.beam.sdk.metrics.Counter;
1212
import org.apache.beam.sdk.metrics.Metrics;
1313
import org.apache.beam.sdk.options.PipelineOptionsFactory;
14+
import org.apache.beam.sdk.transforms.Create;
1415
import org.apache.beam.sdk.transforms.Flatten;
1516
import org.apache.beam.sdk.transforms.Values;
1617
import org.apache.beam.sdk.transforms.Wait;
1718
import org.apache.beam.sdk.values.PCollection;
1819
import org.apache.beam.sdk.values.PCollectionList;
1920
import org.apache.beam.sdk.values.PCollectionTuple;
21+
import org.datacommons.ingestion.data.Edge;
2022
import org.datacommons.ingestion.data.GraphReader;
23+
import org.datacommons.ingestion.data.Node;
2124
import org.datacommons.ingestion.spanner.SpannerClient;
2225
import org.datacommons.pipeline.util.PipelineUtils;
2326
import org.datacommons.proto.Mcf.McfGraph;
@@ -95,9 +98,10 @@ public static void main(String[] args) {
9598
GraphReader.getProvenanceMcf(
9699
options.getStorageBucketId(), importName, latestVersion, pipeline);
97100

98-
PCollection<Mutation> deleteMutations =
99-
GraphReader.getDeleteMutations(importName, provenance, pipeline, spannerClient);
100-
deleteMutationList.add(deleteMutations);
101+
// PCollection<Mutation> deleteMutations =
102+
// GraphReader.getDeleteMutations(importName, provenance, pipeline, spannerClient);
103+
// deleteMutationList.add(deleteMutations);
104+
101105
// Read schema mcf files and combine MCF nodes, and convert to spanner mutations (Node/Edge).
102106
String graphPath =
103107
latestVersion.replaceAll("/+$", "")
@@ -115,13 +119,23 @@ public static void main(String[] args) {
115119
.and(provenanceMcf)
116120
.apply("FlattenSchema", Flatten.pCollections());
117121

118-
PCollection<Mutation> edgeMutations =
119-
GraphReader.graphToEdges(schemaMcf, provenance, spannerClient, edgeCounter)
120-
.apply("ExtractEdgeMutations", Values.create());
122+
PCollection<Edge> edges = GraphReader.mcfToEdges(schemaMcf, provenance, edgeCounter);
123+
124+
PCollection<Mutation> edgeMutations = GraphReader.edgeToMutations(edges, spannerClient);
125+
126+
PCollection<Node> newNodes =
127+
GraphReader.mcfToNodes(schemaMcf, nodeCounter, nodeInvalidTypeCounter);
128+
129+
PCollection<Node> existingNodes = spannerClient.readExistingNodes(newNodes);
130+
131+
PCollection<Node> mergedNodes =
132+
PCollectionList.of(newNodes)
133+
.and(existingNodes)
134+
.apply("FlattenNodes", Flatten.pCollections());
135+
136+
PCollection<Node> finalNodes = GraphReader.combineNodes(mergedNodes);
121137

122-
PCollection<Mutation> nodeMutations =
123-
GraphReader.graphToNodes(schemaMcf, spannerClient, nodeCounter, nodeInvalidTypeCounter)
124-
.apply("ExtractEdgeMutations", Values.create());
138+
PCollection<Mutation> nodeMutations = GraphReader.nodeToMutations(finalNodes, spannerClient);
125139

126140
nodeMutationList.add(nodeMutations);
127141
edgeMutationList.add(edgeMutations);
@@ -135,8 +149,9 @@ public static void main(String[] args) {
135149
obsMutationList.add(observationMutations);
136150
}
137151
PCollection<Mutation> deleteMutations =
138-
PCollectionList.of(deleteMutationList)
139-
.apply("FlattenDeleteMutations", Flatten.pCollections());
152+
pipeline.apply(Create.empty(org.apache.beam.sdk.values.TypeDescriptor.of(Mutation.class)));
153+
// PCollectionList.of(deleteMutationList)
154+
// .apply("FlattenDeleteMutations", Flatten.pCollections());
140155
SpannerWriteResult deleted =
141156
deleteMutations.apply("DeleteImportsFromSpanner", spannerClient.getWriteTransform());
142157
// Write the mutations to spanner.

0 commit comments

Comments
 (0)