Merge pull request #23 from target/StringRegexCheck

samratmitra-0812 · web-flow · commit 88759077a1e8 · 2019-09-17T14:36:56.000+05:30
String regex check
diff --git a/README.md b/README.md
@@ -282,6 +282,17 @@ At least one of `minLength` or `maxLength` must be specified. The data type of `
 | `maxLength` | Integer | Upper bound of the length of the string, inclusive.
 | `threshold` | String | See above description of threshold.
 
+#### `stringRegexCheck`
+
+Takes 2 to 3 parameters, described in the table below. If the `column` value does not match the pattern specified by the `regex`, the check will fail.
+A value for `regex` must be specified. The data type of `column` must be String. Rows in which the `column` value is null are not counted as failures.
+
+| Arg         | Type   | Description                                                             |
+|-------------|--------|-------------------------------------------------------------------------|
+| `column`    | String | Table column to be checked. The DataType of the column must be a String |
+| `regex`     | String | Java regular expression, as evaluated by Spark's `RLike`.               |
+| `threshold` | String | See above description of threshold.                                     |
+
 #### `rowCount`
 
 The minimum number of rows a table must have to pass the validator.
@@ -372,6 +383,15 @@ tables:
               column: occupation
               minLength: 1
               maxLength: 5
+
+      # stringRegexCheck - checks whether the string in the column matches the pattern specified by `regex` and counts the number of rows in which there is a mismatch.
+            - type: stringRegexCheck
+              column: occupation
+              regex: ^ENGINEER$  # matches the word ENGINEER
+
+            - type: stringRegexCheck
+              column: occupation
+              regex: \w  # matches any value containing at least one word character
 ```
 
 ## Working with OOZIE Workflows
diff --git a/src/main/scala/com/target/data_validator/validator/JsonDecoders.scala b/src/main/scala/com/target/data_validator/validator/JsonDecoders.scala
@@ -16,6 +16,7 @@ object JsonDecoders extends LazyLogging {
       case "rangeCheck" => RangeCheck.fromJson(c)
       case "uniqueCheck" => UniqueCheck.fromJson(c)
       case "stringLengthCheck" => StringLengthCheck.fromJson(c)
+      case "stringRegexCheck" => StringRegexCheck.fromJson(c)
       case x => logger.error(s"Unknown Check `$x` in config!")
         throw new RuntimeException(s"Unknown Check in config `$x`")
     }
diff --git a/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala b/src/main/scala/com/target/data_validator/validator/StringRegexCheck.scala
@@ -0,0 +1,97 @@
+package com.target.data_validator.validator
+
+import com.target.data_validator.{JsonEncoders, ValidatorError, VarSubstitution}
+import com.target.data_validator.JsonUtils.debugJson
+import com.target.data_validator.validator.ValidatorBase._
+import com.typesafe.scalalogging.LazyLogging
+import io.circe.{DecodingFailure, HCursor, Json}
+import io.circe.syntax._
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types.{StringType, StructType}
+
+/**
+  * Row-based check that flags rows whose `column` value does not match `regex`.
+  *
+  * @param column    name of the column to test; `configCheck` requires it to be a String column
+  * @param regex     pattern to match against, as JSON; `configCheck` reports an error when absent
+  * @param threshold optional threshold, carried through variable substitution unchanged in meaning
+  */
+case class StringRegexCheck(
+                             column: String,
+                             regex: Option[Json],
+                             threshold: Option[String]
+                            ) extends RowBased {
+
+  /**
+    * Builds a new check with variables in `column`, `regex` and `threshold` substituted
+    * from `dict`, and copies every event recorded on this check onto the new one.
+    */
+  override def substituteVariables(dict: VarSubstitution): ValidatorBase = {
+    // Substitute in declaration order so any events are recorded in the same sequence.
+    val newColumn = getVarSub(column, "column", dict)
+    val newRegex = regex.map(getVarSubJson(_, "regex", dict))
+    val newThreshold = threshold.map(getVarSub(_, "threshold", dict))
+
+    val substituted = StringRegexCheck(newColumn, newRegex, newThreshold)
+    getEvents.foreach(substituted.addEvent)
+    substituted
+  }
+
+  /**
+    * Expression that is true for a failing row: the column is not null and does not
+    * match the regex.
+    *
+    * Throws a `RuntimeException` when no regex is defined.
+    */
+  override def colTest(schema: StructType, dict: VarSubstitution): Expression = {
+    val colExp = UnresolvedAttribute(column)
+    val pattern = regex.getOrElse(throw new RuntimeException("Must define a regex."))
+    val matches = RLike(colExp, createLiteralOrUnresolvedAttribute(StringType, pattern))
+
+    // RLike returns false if the column value is null. To avoid counting null values as
+    // validation failures (like other validations), an explicit non-null check on the
+    // column value is required.
+    val ret = And(Not(matches), IsNotNull(colExp))
+    logger.debug(s"Expr: $ret")
+    ret
+  }
+
+  /**
+    * Validates the configuration against `df`: a regex must be defined and, when the
+    * column is found, its data type must be String. Problems are recorded as
+    * `ValidatorError` events; the result is the value of `failed`.
+    */
+  override def configCheck(df: DataFrame): Boolean = {
+    // A regex is mandatory for this check.
+    if (regex.isEmpty) {
+      addEvent(ValidatorError("Must define a regex."))
+    }
+
+    // Only a String column can be matched against a regex.
+    findColumnInDataFrame(df, column).map(_.dataType).foreach {
+      case _: StringType => ()
+      case dataType =>
+        addEvent(ValidatorError(s"Data type of column '$column' must be String, but was found to be $dataType"))
+    }
+
+    failed
+  }
+
+  /**
+    * JSON form of this check: `type`, `column`, `regex` (only when defined) and `events`.
+    */
+  override def toJson: Json = {
+    import JsonEncoders.eventEncoder
+    val identity = Seq(
+      "type" -> Json.fromString("stringRegexCheck"),
+      "column" -> Json.fromString(column)
+    )
+    val regexField = regex.map("regex" -> _).toSeq
+    val eventsField = Seq("events" -> getEvents.asJson)
+    Json.obj(identity ++ regexField ++ eventsField: _*)
+  }
+}
+
+object StringRegexCheck extends LazyLogging {
+  /**
+    * Builds a [[StringRegexCheck]] from its JSON configuration.
+    *
+    * `column` is required: when it is missing or is not a string, the decoding failure is
+    * returned as a `Left` instead of being thrown. `regex` and `threshold` are decoded
+    * leniently and become `None` when absent or of the wrong type.
+    *
+    * @param c cursor positioned on the JSON object describing the check
+    * @return the decoded check, or the `DecodingFailure` produced while reading `column`
+    */
+  def fromJson(c: HCursor): Either[DecodingFailure, ValidatorBase] = {
+    val regex = c.downField("regex").as[Json].right.toOption
+    val threshold = c.downField("threshold").as[String].right.toOption
+
+    // `.right.map` rather than `.right.get`: a bad or missing `column` must surface as a
+    // Left(DecodingFailure), matching the declared return type, not as a NoSuchElementException.
+    c.downField("column").as[String].right.map { column =>
+      logger.debug(s"column: $column")
+      logger.debug(s"regex: $regex type: ${regex.getClass.getCanonicalName}")
+      logger.debug(s"threshold: $threshold type: ${threshold.getClass.getCanonicalName}")
+
+      c.focus.foreach {f => logger.info(s"StringRegexCheckJson: ${f.spaces2}")}
+      StringRegexCheck(column, regex, threshold)
+    }
+  }
+}
diff --git a/src/test/scala/com/target/data_validator/validator/Mocker.scala b/src/test/scala/com/target/data_validator/validator/Mocker.scala
@@ -0,0 +1,22 @@
+package com.target.data_validator.validator
+
+import com.target.data_validator._
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import io.circe.Json
+
+/** Test helpers for building DataFrames and variable-substitution dictionaries. */
+trait Mocker {
+
+  /** Creates a DataFrame with the given `schema` from the rows in `data`. */
+  def mkDataFrame(spark: SparkSession, data: List[Row], schema: StructType): DataFrame = {
+    val rows = spark.sparkContext.parallelize(data)
+    spark.createDataFrame(rows, schema)
+  }
+
+  /**
+    * Builds a `VarSubstitution` from name/value pairs. `Json` values are registered with
+    * `add` and `String` values with `addString`; any other value type raises a `MatchError`.
+    */
+  def mkParams(params: List[Tuple2[String, Any]] = List.empty): VarSubstitution = {
+    val dict = new VarSubstitution
+    for (entry <- params) {
+      entry._2 match {
+        case json: Json => dict.add(entry._1, json)
+        case text: String => dict.addString(entry._1, text)
+      }
+    }
+    dict
+  }
+}
diff --git a/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala b/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ object JsonDecoders extends LazyLogging {`
`16`	`16`	`case "rangeCheck" => RangeCheck.fromJson(c)`
`17`	`17`	`case "uniqueCheck" => UniqueCheck.fromJson(c)`
`18`	`18`	`case "stringLengthCheck" => StringLengthCheck.fromJson(c)`
	`19`	`+ case "stringRegexCheck" => StringRegexCheck.fromJson(c)`
`19`	`20`	case x => logger.error(s"Unknown Check `$x` in config!")
`20`	`21`	throw new RuntimeException(s"Unknown Check in config `$x`")
`21`	`22`	`}`