Skip to content

Commit 1e9ced9

Browse files
authored
[GH-2760] Extend OSM PBF reader to support additional metadata fields (#2776)
1 parent 0c17835 commit 1e9ced9

10 files changed

Lines changed: 337 additions & 15 deletions

File tree

spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,23 +26,42 @@ public class DenseNodeExtractor implements Extractor {
2626
long latOffset;
2727
long lonOffset;
2828
long granularity;
29+
int dateGranularity;
2930
long firstId;
3031
long firstLat;
3132
long firstLon;
3233
Integer keyIndex;
3334

35+
// DenseInfo delta accumulators
36+
boolean hasDenseInfo;
37+
long firstTimestamp;
38+
long firstChangeset;
39+
int firstUid;
40+
int firstUserSid;
41+
3442
Osmformat.DenseNodes nodes;
3543

3644
public DenseNodeExtractor(
37-
Osmformat.DenseNodes nodes, long latOffset, long lonOffset, long granularity) {
45+
Osmformat.DenseNodes nodes,
46+
long latOffset,
47+
long lonOffset,
48+
long granularity,
49+
int dateGranularity) {
3850
this.firstId = 0;
3951
this.firstLat = 0;
4052
this.firstLon = 0;
4153
this.latOffset = latOffset;
4254
this.lonOffset = lonOffset;
4355
this.granularity = granularity;
56+
this.dateGranularity = dateGranularity;
4457
this.nodes = nodes;
4558
this.keyIndex = 0;
59+
60+
this.hasDenseInfo = nodes.hasDenseinfo() && nodes.getDenseinfo().getVersionCount() > 0;
61+
this.firstTimestamp = 0;
62+
this.firstChangeset = 0;
63+
this.firstUid = 0;
64+
this.firstUserSid = 0;
4665
}
4766

4867
public OsmNode extract(int idx, Osmformat.StringTable stringTable) {
@@ -63,7 +82,53 @@ private OsmNode parse(int idx, Osmformat.StringTable stringTable) {
6382

6483
HashMap<String, String> tags = parseTags(stringTable);
6584

66-
return new OsmNode(id, lat, lon, tags);
85+
OsmNode node = new OsmNode(id, lat, lon, tags);
86+
87+
if (hasDenseInfo) {
88+
Osmformat.DenseInfo denseInfo = nodes.getDenseinfo();
89+
90+
// version is NOT delta-encoded
91+
if (denseInfo.getVersionCount() > idx) {
92+
node.setVersion(denseInfo.getVersion(idx));
93+
}
94+
95+
// timestamp, changeset, uid, user_sid are delta-encoded
96+
if (denseInfo.getTimestampCount() > idx) {
97+
long timestamp = denseInfo.getTimestamp(idx) + firstTimestamp;
98+
firstTimestamp = timestamp;
99+
node.setTimestamp(timestamp * dateGranularity);
100+
}
101+
102+
if (denseInfo.getChangesetCount() > idx) {
103+
long changeset = denseInfo.getChangeset(idx) + firstChangeset;
104+
firstChangeset = changeset;
105+
node.setChangeset(changeset);
106+
}
107+
108+
if (denseInfo.getUidCount() > idx) {
109+
int uid = denseInfo.getUid(idx) + firstUid;
110+
firstUid = uid;
111+
node.setUid(uid);
112+
}
113+
114+
if (denseInfo.getUserSidCount() > idx) {
115+
int userSid = denseInfo.getUserSid(idx) + firstUserSid;
116+
firstUserSid = userSid;
117+
if (userSid > 0) {
118+
node.setUser(stringTable.getS(userSid).toStringUtf8());
119+
}
120+
}
121+
122+
// visible is NOT delta-encoded, and may not be present
123+
if (denseInfo.getVisibleCount() > idx) {
124+
node.setVisible(denseInfo.getVisible(idx));
125+
} else {
126+
// Per OSM PBF spec, missing 'visible' must be treated as true (especially in history files)
127+
node.setVisible(true);
128+
}
129+
}
130+
131+
return node;
67132
}
68133

69134
HashMap<String, String> parseTags(Osmformat.StringTable stringTable) {
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.sedona.sql.datasources.osmpbf.features;
20+
21+
import org.apache.sedona.sql.datasources.osmpbf.build.Osmformat;
22+
import org.apache.sedona.sql.datasources.osmpbf.model.OSMEntity;
23+
24+
public class InfoResolver {
25+
26+
public static void populateInfo(
27+
OSMEntity entity,
28+
Osmformat.Info info,
29+
Osmformat.StringTable stringTable,
30+
int dateGranularity) {
31+
if (info == null) {
32+
return;
33+
}
34+
entity.setVersion(info.getVersion());
35+
entity.setTimestamp((long) info.getTimestamp() * dateGranularity);
36+
entity.setChangeset(info.getChangeset());
37+
entity.setUid(info.getUid());
38+
if (info.getUserSid() > 0) {
39+
entity.setUser(stringTable.getS(info.getUserSid()).toStringUtf8());
40+
}
41+
if (info.hasVisible()) {
42+
entity.setVisible(info.getVisible());
43+
} else {
44+
// Per OSM PBF spec, when HistoricalInformation is a required_feature,
45+
// missing "visible" must be treated as true. Default to true when absent.
46+
entity.setVisible(true);
47+
}
48+
}
49+
}

spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/BlobIterator.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,8 @@ public OSMEntity next() {
7272

7373
Iterator<OSMEntity> resolveIterator() {
7474
return IteratorUtils.chainedIterator(
75-
new WayIterator(currentPrimitiveGroup.getWaysList(), primitiveBlock.getStringtable()),
76-
new RelationIterator(
77-
currentPrimitiveGroup.getRelationsList(), primitiveBlock.getStringtable()),
75+
new WayIterator(currentPrimitiveGroup.getWaysList(), primitiveBlock),
76+
new RelationIterator(currentPrimitiveGroup.getRelationsList(), primitiveBlock),
7877
new NodeIterator(currentPrimitiveGroup.getNodesList(), primitiveBlock),
7978
currentPrimitiveGroup.getDense() != null
8079
? new DenseNodeIterator(
@@ -84,7 +83,8 @@ Iterator<OSMEntity> resolveIterator() {
8483
currentPrimitiveGroup.getDense(),
8584
primitiveBlock.getLatOffset(),
8685
primitiveBlock.getLonOffset(),
87-
primitiveBlock.getGranularity()))
86+
primitiveBlock.getGranularity(),
87+
primitiveBlock.getDateGranularity()))
8888
: Collections.emptyIterator());
8989
}
9090
}

spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/NodeIterator.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.util.Iterator;
2323
import java.util.List;
2424
import org.apache.sedona.sql.datasources.osmpbf.build.Osmformat;
25+
import org.apache.sedona.sql.datasources.osmpbf.features.InfoResolver;
2526
import org.apache.sedona.sql.datasources.osmpbf.features.TagsResolver;
2627
import org.apache.sedona.sql.datasources.osmpbf.model.OSMEntity;
2728
import org.apache.sedona.sql.datasources.osmpbf.model.OsmNode;
@@ -85,6 +86,13 @@ private OsmNode parse(int idx) {
8586
HashMap<String, String> tags =
8687
TagsResolver.resolveTags(node.getKeysCount(), node::getKeys, node::getVals, stringTable);
8788

88-
return new OsmNode(id, lat, lon, tags);
89+
OsmNode osmNode = new OsmNode(id, lat, lon, tags);
90+
91+
if (node.hasInfo()) {
92+
InfoResolver.populateInfo(
93+
osmNode, node.getInfo(), stringTable, primitiveBlock.getDateGranularity());
94+
}
95+
96+
return osmNode;
8997
}
9098
}

spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/RelationIterator.java

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.util.Iterator;
2323
import java.util.List;
2424
import org.apache.sedona.sql.datasources.osmpbf.build.Osmformat;
25+
import org.apache.sedona.sql.datasources.osmpbf.features.InfoResolver;
2526
import org.apache.sedona.sql.datasources.osmpbf.features.TagsResolver;
2627
import org.apache.sedona.sql.datasources.osmpbf.model.OSMEntity;
2728
import org.apache.sedona.sql.datasources.osmpbf.model.Relation;
@@ -32,12 +33,15 @@ public class RelationIterator implements Iterator<OSMEntity> {
3233
long relationCount;
3334
List<Osmformat.Relation> relations;
3435
Osmformat.StringTable stringTable;
36+
Osmformat.PrimitiveBlock primitiveBlock;
3537

36-
public RelationIterator(List<Osmformat.Relation> relations, Osmformat.StringTable stringTable) {
38+
public RelationIterator(
39+
List<Osmformat.Relation> relations, Osmformat.PrimitiveBlock primitiveBlock) {
3740
this.idx = 0;
3841
this.relationCount = 0;
3942
this.relations = relations;
40-
this.stringTable = stringTable;
43+
this.stringTable = primitiveBlock.getStringtable();
44+
this.primitiveBlock = primitiveBlock;
4145

4246
if (relations != null) {
4347
this.relationCount = relations.size();
@@ -79,7 +83,14 @@ private Relation parse(Osmformat.Relation relation) {
7983
TagsResolver.resolveTags(
8084
relation.getKeysCount(), relation::getKeys, relation::getVals, stringTable);
8185

82-
return new Relation(relation.getId(), tags, refs, refTypes, roles);
86+
Relation relationEntity = new Relation(relation.getId(), tags, refs, refTypes, roles);
87+
88+
if (relation.hasInfo()) {
89+
InfoResolver.populateInfo(
90+
relationEntity, relation.getInfo(), stringTable, primitiveBlock.getDateGranularity());
91+
}
92+
93+
return relationEntity;
8394
}
8495

8596
private String[] resolveRefRoles(Osmformat.Relation relation) {

spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/WayIterator.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.util.Iterator;
2323
import java.util.List;
2424
import org.apache.sedona.sql.datasources.osmpbf.build.Osmformat;
25+
import org.apache.sedona.sql.datasources.osmpbf.features.InfoResolver;
2526
import org.apache.sedona.sql.datasources.osmpbf.features.TagsResolver;
2627
import org.apache.sedona.sql.datasources.osmpbf.model.OSMEntity;
2728
import org.apache.sedona.sql.datasources.osmpbf.model.Way;
@@ -31,12 +32,14 @@ public class WayIterator implements Iterator<OSMEntity> {
3132
long waysCount;
3233
List<Osmformat.Way> ways;
3334
Osmformat.StringTable stringTable;
35+
Osmformat.PrimitiveBlock primitiveBlock;
3436

35-
public WayIterator(List<Osmformat.Way> ways, Osmformat.StringTable stringTable) {
37+
public WayIterator(List<Osmformat.Way> ways, Osmformat.PrimitiveBlock primitiveBlock) {
3638
this.idx = 0;
3739
this.waysCount = 0;
3840
this.ways = ways;
39-
this.stringTable = stringTable;
41+
this.stringTable = primitiveBlock.getStringtable();
42+
this.primitiveBlock = primitiveBlock;
4043

4144
if (ways != null) {
4245
this.waysCount = ways.size();
@@ -79,6 +82,13 @@ private Way parse(Osmformat.Way way) {
7982
HashMap<String, String> tags =
8083
TagsResolver.resolveTags(way.getKeysCount(), way::getKeys, way::getVals, stringTable);
8184

82-
return new Way(way.getId(), tags, refs);
85+
Way wayEntity = new Way(way.getId(), tags, refs);
86+
87+
if (way.hasInfo()) {
88+
InfoResolver.populateInfo(
89+
wayEntity, way.getInfo(), stringTable, primitiveBlock.getDateGranularity());
90+
}
91+
92+
return wayEntity;
8393
}
8494
}

spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/model/OSMEntity.java

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,14 @@ public class OSMEntity {
3131
private String[] refRoles;
3232
private String[] refTypes;
3333

34+
// Metadata fields from Info/DenseInfo
35+
private Integer version;
36+
private Long timestamp; // milliseconds since epoch
37+
private Long changeset;
38+
private Integer uid;
39+
private String user;
40+
private Boolean visible;
41+
3442
public OSMEntity(
3543
long id, double latitude, double longitude, HashMap<String, String> tags, String kind) {
3644
this.id = id;
@@ -93,4 +101,52 @@ public String[] getRefTypes() {
93101
public long getId() {
94102
return id;
95103
}
104+
105+
public Integer getVersion() {
106+
return version;
107+
}
108+
109+
public void setVersion(Integer version) {
110+
this.version = version;
111+
}
112+
113+
public Long getTimestamp() {
114+
return timestamp;
115+
}
116+
117+
public void setTimestamp(Long timestamp) {
118+
this.timestamp = timestamp;
119+
}
120+
121+
public Long getChangeset() {
122+
return changeset;
123+
}
124+
125+
public void setChangeset(Long changeset) {
126+
this.changeset = changeset;
127+
}
128+
129+
public Integer getUid() {
130+
return uid;
131+
}
132+
133+
public void setUid(Integer uid) {
134+
this.uid = uid;
135+
}
136+
137+
public String getUser() {
138+
return user;
139+
}
140+
141+
public void setUser(String user) {
142+
this.user = user;
143+
}
144+
145+
public Boolean getVisible() {
146+
return visible;
147+
}
148+
149+
public void setVisible(Boolean visible) {
150+
this.visible = visible;
151+
}
96152
}

spark/common/src/main/scala/org/apache/sedona/sql/datasources/osm/OsmPartitionReader.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,16 @@ case class OsmPartitionReader(
8989
if (entity.getRefTypes != null)
9090
ArrayData.toArrayData(entity.getRefTypes.map(x => UTF8String.fromString(x)))
9191
else null
92+
case "changeset" => entity.getChangeset
93+
case "timestamp" =>
94+
if (entity.getTimestamp != null)
95+
entity.getTimestamp * 1000L // ms to microseconds for Spark TimestampType
96+
else null
97+
case "uid" => entity.getUid
98+
case "user" =>
99+
if (entity.getUser != null) UTF8String.fromString(entity.getUser) else null
100+
case "version" => entity.getVersion
101+
case "visible" => entity.getVisible
92102
}
93103
}))
94104
}

spark/common/src/main/scala/org/apache/sedona/sql/datasources/osm/SchemaProvider.scala

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
*/
1919
package org.apache.sedona.sql.datasources.osm
2020

21-
import org.apache.spark.sql.types.{ArrayType, DoubleType, LongType, MapType, StringType, StructField, StructType}
21+
import org.apache.spark.sql.types.{ArrayType, BooleanType, DoubleType, IntegerType, LongType, MapType, StringType, StructField, StructType, TimestampType}
2222

2323
trait SchemaProvider {
2424
def schema: StructType =
@@ -39,5 +39,11 @@ trait SchemaProvider {
3939
nullable = true),
4040
StructField("refs", ArrayType(LongType), nullable = true),
4141
StructField("ref_roles", ArrayType(StringType), nullable = true),
42-
StructField("ref_types", ArrayType(StringType), nullable = true)))
42+
StructField("ref_types", ArrayType(StringType), nullable = true),
43+
StructField("changeset", LongType, nullable = true),
44+
StructField("timestamp", TimestampType, nullable = true),
45+
StructField("uid", IntegerType, nullable = true),
46+
StructField("user", StringType, nullable = true),
47+
StructField("version", IntegerType, nullable = true),
48+
StructField("visible", BooleanType, nullable = true)))
4349
}

0 commit comments

Comments
 (0)