elastacloud
diff --git a/‎.github/workflows/main.yml‎
Lines changed: 7 additions & 6 deletions b/‎.github/workflows/main.yml‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎README.md‎
Lines changed: 30 additions & 16 deletions b/‎README.md‎
Lines changed: 30 additions & 16 deletions
diff --git a/‎build.ps1‎
Lines changed: 13 additions & 2 deletions b/‎build.ps1‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎build.sbt‎
Lines changed: 18 additions & 8 deletions b/‎build.sbt‎
Lines changed: 18 additions & 8 deletions
diff --git a/‎project/plugins.sbt‎
Lines changed: 1 addition & 0 deletions b/‎project/plugins.sbt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎…ala/com/bp/sds/cef/CefOutputWriter.scala‎ ‎…stacloud/spark/cef/CefOutputWriter.scala‎src/main/3.0/scala/com/bp/sds/cef/CefOutputWriter.scala renamed to src/main/3.0/scala/com/elastacloud/spark/cef/CefOutputWriter.scala
Lines changed: 1 addition & 1 deletion b/‎…ala/com/bp/sds/cef/CefOutputWriter.scala‎ ‎…stacloud/spark/cef/CefOutputWriter.scala‎src/main/3.0/scala/com/bp/sds/cef/CefOutputWriter.scala renamed to src/main/3.0/scala/com/elastacloud/spark/cef/CefOutputWriter.scala
Lines changed: 1 addition & 1 deletion
diff --git a/‎…/bp/sds/cef/CefOutputWriterBuilder.scala‎ ‎…d/spark/cef/CefOutputWriterBuilder.scala‎src/main/3.0/scala/com/bp/sds/cef/CefOutputWriterBuilder.scala renamed to src/main/3.0/scala/com/elastacloud/spark/cef/CefOutputWriterBuilder.scala
Lines changed: 1 addition & 1 deletion b/‎…/bp/sds/cef/CefOutputWriterBuilder.scala‎ ‎…d/spark/cef/CefOutputWriterBuilder.scala‎src/main/3.0/scala/com/bp/sds/cef/CefOutputWriterBuilder.scala renamed to src/main/3.0/scala/com/elastacloud/spark/cef/CefOutputWriterBuilder.scala
Lines changed: 1 addition & 1 deletion
diff --git a/‎…/sds/cef/CefPartitionReaderFactory.scala‎ ‎…park/cef/CefPartitionReaderFactory.scala‎src/main/scala/com/bp/sds/cef/CefPartitionReaderFactory.scala renamed to src/main/3.0/scala/com/elastacloud/spark/cef/CefPartitionReaderFactory.scala
Lines changed: 1 addition & 1 deletion b/‎…/sds/cef/CefPartitionReaderFactory.scala‎ ‎…park/cef/CefPartitionReaderFactory.scala‎src/main/scala/com/bp/sds/cef/CefPartitionReaderFactory.scala renamed to src/main/3.0/scala/com/elastacloud/spark/cef/CefPartitionReaderFactory.scala
Lines changed: 1 addition & 1 deletion
diff --git a/‎…n/3.2/scala/com/bp/sds/cef/CefScan.scala‎ ‎…/com/elastacloud/spark/cef/CefScan.scala‎src/main/3.2/scala/com/bp/sds/cef/CefScan.scala renamed to src/main/3.0/scala/com/elastacloud/spark/cef/CefScan.scala
Lines changed: 1 addition & 1 deletion b/‎…n/3.2/scala/com/bp/sds/cef/CefScan.scala‎ ‎…/com/elastacloud/spark/cef/CefScan.scala‎src/main/3.2/scala/com/bp/sds/cef/CefScan.scala renamed to src/main/3.0/scala/com/elastacloud/spark/cef/CefScan.scala
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/main/3.0/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala‎
Lines changed: 42 additions & 0 deletions b/‎src/main/3.0/scala/org/apache/spark/sql/catalyst/FileSourceOptions.scala‎
Lines changed: 42 additions & 0 deletions
@@ -4,7 +4,7 @@ on:
   push:
     branches: [ main ]
   pull_request:
-    branches: [ main ]
+    types: [ opened, reopened ]
   workflow_dispatch:
 
 jobs:
@@ -14,21 +14,22 @@ jobs:
 
     strategy:
       matrix:
-        spark-version: [3.0.1, 3.0.2, 3.0.3, 3.1.1, 3.1.2, 3.1.3, 3.2.0, 3.2.2, 3.3.0]
+        spark-version: [ 3.0.1, 3.0.2, 3.0.3, 3.1.2, 3.2.1, 3.2.4, 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.4.0, 3.4.1, 3.5.0 ]
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3.5.2
 
     - name: Set up JDK 1.8
-      uses: actions/setup-java@v1
+      uses: actions/setup-java@v3.11.0
       with:
-        java-version: 1.8
+        java-version: '8'
+        distribution: 'adopt'
 
     - name: Test and package
       run: sbt -DsparkVersion="${{ matrix.spark-version }}" clean compile test package
 
     - name: Upload the package
-      uses: actions/upload-artifact@v2.2.0
+      uses: actions/upload-artifact@v3.1.2
       with:
         path: ./target/**/spark-cef-reader*.jar
         if-no-files-found: warn
@@ -3,7 +3,21 @@
 A custom Spark data source supporting the [Common Event Format](https://support.citrix.com/article/CTX136146) V25
 standard for logging events.
 
-[![Spark library CI](https://github.com/bp/spark-cef-reader/actions/workflows/main.yml/badge.svg)](https://github.com/bp/spark-cef-reader/actions/workflows/main.yml)
+[![Spark library CI](https://github.com/elastacloud/spark-cef-reader/actions/workflows/main.yml/badge.svg)](https://github.com/elastacloud/spark-cef-reader/actions/workflows/main.yml)
+
+## Fork
+
+This is a fork taken from the original source at [https://github.com/bp/spark-cef-reader](https://github.com/bp/spark-cef-reader)
+which was created by the same authors as this fork.
+
+This fork has the following changes applied to it at the time of the fork. Subsequent changes can be viewed
+in the history of the source code and in the release notes.
+
+* Updated to include support for Spark 3.4 and 3.5
+* Rewrote the options class to support Spark 3.4
+* Renamed the package to not violate any trademarks and to ensure that this is seen as a derivative work
+
+This repository contains all history from the original source and has the same license applied.
 
 ## Supported Features
 
@@ -17,21 +31,20 @@ standard for logging events.
 ## Usage
 
 ```scala
-import com.bp.sds.cef._
 import org.apache.spark.sql.SparkSession
 
 val spark = SparkSession.builder().getOrCreate()
 
 // Read using provided data frame reader
 val df = spark.read
-  .option("maxRecords", "10000")  // Optional, default 10,000
-  .option("pivotFields", "true")  // Optional, default is false
+  .option("maxRecords", "10000") // Optional, default 10,000
+  .option("pivotFields", "true") // Optional, default is false
   .cef("/path/to/file.log")
 
 // Writing the data back out
 df.write
   .mode("overwrite")
-  .option("nullValue", "NA")      // Optional
+  .option("nullValue", "NA") // Optional
   .option("dateFormat", "millis") // Optional
   .cef("/path/to/output/file.log")
 
@@ -41,7 +54,7 @@ df.write
 val dfShort = spark.read.format("cef").load("/path/to/file.log")
 
 // Using the fully qualified name
-val dfFull = spark.read.format("com.bp.sds.cef").load("/path/to/file.log")
+val dfFull = spark.read.format("com.elastacloud.spark.cef").load("/path/to/file.log")
 
 // The path to the file may be an absolute path name, multiple path names, or a glob pattern.
 val dfGlob = spark.read.cef("/landing/events/year=2020/month=*/day=*/*.log.gz")
@@ -50,6 +63,8 @@ val dfGlob = spark.read.cef("/landing/events/year=2020/month=*/day=*/*.log.gz")
 Available for use in Spark SQL as well
 
 ```sql
+-- Note the use of backticks around the path
+
 SELECT
     *
 FROM
@@ -61,16 +76,15 @@ FROM
 The following options are available to pass to the data source, where they are not defined then the default value
 will be used.
 
-Option | Type | Default | Supported Actions | Purpose
------- | ---- | ------- | ----------------- | -------
-maxRecords | Integer | 10,000 | Read | The number of records to scan when inferring the schema. The data source will keep scanning until either the maximum number of records have been reached or there are no more files to scan.
-pivotFields | Boolean | false | Read | Scans for field pairs in the format of `key=value keyLabel=OtherKey` and pivots the data to `OtherKey=value`.
-defensiveMode | Boolean | false | Read | Used if a feed is known to violate the CEF spec. Adds overhead to the parsing so only use when there are known violations.
-nullValue | String | `-` | Read/Write | A value used in the CEF records which should be parsed as a `null` value.
-mode | ParseMode | Permissive | Read | Permitted values are `permissive`, `dropmalformed` and `failfast`. When used in `FailFast` mode the parser will throw an error on the first record exception found. When used in `Permissive` mode it will attempt to parse as much of the record as possible, with `null` values used for all other values. Using `dropmalformed` will simply drop any malformed records from the result. `Permissive` mode may be used in combination with the `corruptRecordColumnName` option.
-corruptRecordColumnName | String | `null` | Read | When used with `Permissive` mode the full record is stored in a column with the name provided. If null is provided then the full record is discarded. By providing a name the data source will append a column to the inferred schema.
-dateFormat | String | `MMM dd yyyy HH:mm:ss.SSS zzz` | Write | When writing data this option defines the format time use for timestamp values. The data source will check against CEF valid formats. Alternatively use `millis` to output using milliseconds from the epoch
-
+| Option                  | Type      | Default                        | Supported Actions | Purpose                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+|-------------------------|-----------|--------------------------------|-------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| maxRecords              | Integer   | 10,000                         | Read              | The number of records to scan when inferring the schema. The data source will keep scanning until either the maximum number of records have been reached or there are no more files to scan.                                                                                                                                                                                                                                                                                       |
+| pivotFields             | Boolean   | false                          | Read              | Scans for field pairs in the format of `key=value keyLabel=OtherKey` and pivots the data to `OtherKey=value`.                                                                                                                                                                                                                                                                                                                                                                      |
+| defensiveMode           | Boolean   | false                          | Read              | Used if a feed is known to violate the CEF spec. Adds overhead to the parsing so only use when there are known violations.                                                                                                                                                                                                                                                                                                                                                         |
+| nullValue               | String    | `-`                            | Read/Write        | A value used in the CEF records which should be parsed as a `null` value.                                                                                                                                                                                                                                                                                                                                                                                                          |
+| mode                    | ParseMode | Permissive                     | Read              | Permitted values are `permissive`, `dropmalformed` and `failfast`. When used in `FailFast` mode the parser will throw an error on the first record exception found. When used in `Permissive` mode it will attempt to parse as much of the record as possible, with `null` values used for all other values. Using `dropmalformed` will simply drop any malformed records from the result. `Permissive` mode may be used in combination with the `corruptRecordColumnName` option. |
+| corruptRecordColumnName | String    | `null`                         | Read              | When used with `Permissive` mode the full record is stored in a column with the name provided. If null is provided then the full record is discarded. By providing a name the data source will append a column to the inferred schema.                                                                                                                                                                                                                                             |
+| dateFormat              | String    | `MMM dd yyyy HH:mm:ss.SSS zzz` | Write             | When writing data this option defines the format time use for timestamp values. The data source will check against CEF valid formats. Alternatively use `millis` to output using milliseconds from the epoch                                                                                                                                                                                                                                                                       |
 
 ### CEF supported date formats
 
 
@@ -1,17 +1,28 @@
-$versions = @("3.0.1", "3.0.2", "3.0.3", "3.1.1", "3.1.2", "3.2.0")
+$versions = @("3.0.1", "3.0.2", "3.1.2", "3.2.1", "3.2.4", "3.3.0", "3.3.1", "3.3.2", "3.4.0", "3.4.1", "3.5.0")
 $jarPath = "./target/jars"
+$covPath = "./target/coverage"
 
 Write-Host "Clearing existing jar artefacts" -ForegroundColor Green
 if (Test-Path $jarPath) {
     Remove-Item -Path $jarPath -Force -Recurse
 }
 
+if (Test-Path $covPath) {
+    Remove-Item -Path $covPath -Force -Recurse
+}
+
 New-Item -Path $jarPath -ItemType Directory
+New-Item -Path $covPath -ItemType Directory
 
 foreach ($version in $versions) {
     Write-Host "Building for Spark version: $version" -ForegroundColor Green
-    & sbt -DsparkVersion="$version" clean compile test package
+    & sbt -DsparkVersion="$version" clean coverageOn compile test coverageReport coverageOff package
 }
 
 Write-Host "Copying jar files to $jarPath" -ForegroundColor Green
 Get-ChildItem -Filter "spark-cef*.jar" -Path ./target -Recurse | Copy-Item -Destination $jarPath
+
+Write-Host "Copying coverage information from most recent spark version to $covPath" -ForegroundColor Green
+$maxVersion = $versions | Sort-Object { [version]$_ } | Select-Object -Last 1 # rank as [version], not as text, so 3.10.0 sorts above 3.9.0
+Get-ChildItem -Path ".\target\spark-$maxVersion" -Recurse -Filter "scoverage-report" -Directory | Copy-Item -Destination $covPath -Recurse # HTML report directory
+Get-ChildItem -Path ".\target\spark-$maxVersion" -Recurse -Filter "cobertura.xml" -File | Copy-Item -Destination $covPath # Cobertura XML report
@@ -6,11 +6,11 @@ val scalaTestVersion = settingKey[String]("ScalaTest version")
 
 name := "spark-cef-reader"
 version := "0.6-SNAPSHOT"
-organization := "com.bp"
+organization := "com.elastacloud"
 description := "CEF data source for Spark"
-homepage := Some(url("https://github.com/bp/spark-cef-reader"))
+homepage := Some(url("https://github.com/elastacloud/spark-cef-reader"))
 licenses += ("Apache License, Version 2.0", url("https://www.apache.org/licenses/LICENSE-2.0"))
-scmInfo := Some(ScmInfo(url("https://github.com/bp/spark-cef-reader"), "https://github.com/bp/spark-cef-reader.git"))
+scmInfo := Some(ScmInfo(url("https://github.com/elastacloud/spark-cef-reader"), "https://github.com/elastacloud/spark-cef-reader.git"))
 developers ++= List(
   Developer(id = "dazfuller", name = "Darren Fuller", email = "darren@elastacloud.com", url = url("https://github.com/elastacloud")),
   Developer(id = "azurecoder", name = "Richard Conway", email = "richard@elastacloud.com", url = url("https://github.com/elastacloud"))
@@ -32,8 +32,10 @@ Compile / unmanagedSourceDirectories ++= {
     Seq(baseDirectory.value / "src/main/3.0/scala")
   } else if (sparkVersion.value < "3.3.0") {
     Seq(baseDirectory.value / "src/main/3.2/scala")
-  } else {
+  } else if (sparkVersion.value < "3.4.0") {
     Seq(baseDirectory.value / "src/main/3.3/scala")
+  } else {
+    Seq(baseDirectory.value / "src/main/3.4/scala")
   }
 }
 
@@ -46,15 +48,23 @@ libraryDependencies ++= Seq(
   "org.scalatest" %% "scalatest" % scalaTestVersion.value % Test
 )
 
+coverageOutputCobertura := true
+coverageOutputHTML := true
+coverageMinimumStmtTotal := 70
+coverageFailOnMinimum := false
+coverageHighlighting := true
+
 // Define common settings for the library
 val commonSettings = Seq(
-  sparkVersion := System.getProperty("sparkVersion", "3.3.0"),
+  sparkVersion := System.getProperty("sparkVersion", "3.5.0"),
   scalaVersion := {
-    if (sparkVersion.value >= "3.2.0") {
+    if (sparkVersion.value < "3.2.0") {
+      "2.12.10"
+    } else if (sparkVersion.value < "3.4.0") {
       "2.12.14"
     } else {
-      "2.12.10"
+      "2.12.15"
     }
   },
-  scalaTestVersion := "3.2.13"
+  scalaTestVersion := "3.2.17"
 )
@@ -0,0 +1 @@
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.3")
@@ -1,4 +1,4 @@
-package com.bp.sds.cef
+package com.elastacloud.spark.cef
 
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 
@@ -1,4 +1,4 @@
-package com.bp.sds.cef
+package com.elastacloud.spark.cef
 
 import org.apache.hadoop.mapreduce.Job
 import org.apache.spark.sql.connector.write.LogicalWriteInfo
 
@@ -1,4 +1,4 @@
-package com.bp.sds.cef
+package com.elastacloud.spark.cef
 
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.sql.catalyst.InternalRow
 
@@ -1,4 +1,4 @@
-package com.bp.sds.cef
+package com.elastacloud.spark.cef
 
 import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.SparkSession
 
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.catalyst
+
+import org.apache.spark.sql.catalyst.FileSourceOptions.{IGNORE_CORRUPT_FILES, IGNORE_MISSING_FILES}
+import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
+import org.apache.spark.sql.internal.SQLConf
+
+/**
+ * Common options for the file-based data source; each flag is resolved once, when the instance is constructed.
+ */
+class FileSourceOptions(
+                         @transient private val parameters: CaseInsensitiveMap[String]) // @transient: the option map itself is not serialized
+  extends Serializable {
+
+  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters)) // accepts a plain Map and wraps it
+
+  val ignoreCorruptFiles: Boolean = parameters.get(IGNORE_CORRUPT_FILES).map(_.toBoolean) // explicit option, parsed with toBoolean
+    .getOrElse(SQLConf.get.ignoreCorruptFiles) // key absent: use the value from SQLConf.get
+
+  val ignoreMissingFiles: Boolean = parameters.get(IGNORE_MISSING_FILES).map(_.toBoolean) // explicit option, parsed with toBoolean
+    .getOrElse(SQLConf.get.ignoreMissingFiles) // key absent: use the value from SQLConf.get
+}
+
+object FileSourceOptions { // option keys read by the FileSourceOptions class
+  val IGNORE_CORRUPT_FILES = "ignoreCorruptFiles" // key looked up to set ignoreCorruptFiles
+  val IGNORE_MISSING_FILES = "ignoreMissingFiles" // key looked up to set ignoreMissingFiles
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.3")`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-package com.bp.sds.cef`
	`1`	`+package com.elastacloud.spark.cef`
`2`	`2`
`3`	`3`	`import org.apache.hadoop.fs.Path`
`4`	`4`	`import org.apache.hadoop.mapreduce.TaskAttemptContext`