Commit 7fa7f56

Merge pull request #53 from aws-samples/38-cqlreplicator-on-glue-add-preflight-check

fixed minor issues, added AWS region as a parameter, added the pre-fl…

2 parents: ff30239 + f6e7512

File tree: 3 files changed (+115, -53 lines)

glue/README.MD

Lines changed: 10 additions & 10 deletions

````diff
@@ -68,7 +68,7 @@ if you choose to replicate TTLs, updates, or offloading objects exceeding 1MB to
 Let's run the following command to replicate the workload from the Cassandra cluster to Amazon Keyspaces.
 
 ```shell
-cqlreplicator --state run --tiles 8 --landing-zone s3://cql-replicator-1234567890-us-west-2 \
+cqlreplicator --state run --tiles 8 --landing-zone s3://cql-replicator-1234567890-us-west-2 --region us-west-2 \
 --src-keyspace ks_test_cql_replicator --src-table test_cql_replicator \
 --trg-keyspace ks_test_cql_replicator --trg-table test_cql_replicator --inc-traffic
 ```
@@ -84,34 +84,34 @@ every 2 minutes. At peak traffic, it can reach up to 22,400 WCUs per second.
 
 ### Replicate near-real time updates and inserts
 ```shell
-cqlreplicator --state run --tiles 8 --landing-zone s3://cql-replicator-1234567890-us-west-2 --writetime-column col3 \
---src-keyspace ks_test_cql_replicator --src-table test_cql_replicator \
+cqlreplicator --state run --tiles 8 --landing-zone s3://cql-replicator-1234567890-us-west-2 --region us-west-2 \
+--writetime-column col3 --src-keyspace ks_test_cql_replicator --src-table test_cql_replicator \
 --trg-keyspace ks_test_cql_replicator --trg-table test_cql_replicator --inc-traffic
 ```
 
 ### Replicate with TTL
 the TTL feature should be [enabled](https://docs.aws.amazon.com/keyspaces/latest/devguide/TTL-how-it-works.html#ttl-howitworks_enabling)
 on the target table before running the following command
 ```shell
-cqlreplicator --state run --tiles 8 --landing-zone s3://cql-replicator-1234567890-us-west-2 --writetime-column col3 \
---src-keyspace ks_test_cql_replicator --src-table test_cql_replicator --ttl-column col3 \
+cqlreplicator --state run --tiles 8 --landing-zone s3://cql-replicator-1234567890-us-west-2 --region us-west-2 \
+--writetime-column col3 --src-keyspace ks_test_cql_replicator --src-table test_cql_replicator --ttl-column col3 \
 --trg-keyspace ks_test_cql_replicator --trg-table test_cql_replicator --inc-traffic
 ```
 
 ### Offload large objects
 Before running the migration process configure [lifecycle](https://docs.aws.amazon.com/AmazonS3/latest/userguide/intro-lifecycle-rules.html)
 for the S3 bucket to delete objects after expiration.
 ```shell
-./cqlreplicator --state run --tiles 8 --landing-zone s3://cql-replicator-1234567890-us-west-2 --src-keyspace ks_test_cql_replicator
---src-table test_cql_replicator --trg-keyspace ks_test_cql_replicator --trg-table test_cql_replicator \
---ttl-column col3 --offload-large-objects '{"column":"col1","bucket":"my-application-resource",
+./cqlreplicator --state run --tiles 8 --landing-zone s3://cql-replicator-1234567890-us-west-2 --src-keyspace ks_test_cql_replicator \
+--src-table test_cql_replicator --trg-keyspace ks_test_cql_replicator --trg-table test_cql_replicator \
+--region us-west-2 --ttl-column col3 --offload-large-objects '{"column":"col1","bucket":"my-application-resource",
 "prefix":"ks_test_cql_replicator/test_cql_replicator/col1","xref":"link"}'
 ```
 
 ## Stop migration process
 To stop migration process gracefully run the following command:
 ```shell
-cqlreplicator --state request-stop --landing-zone s3://cql-replicator-1234567890-us-west-2 \
+cqlreplicator --state request-stop --landing-zone s3://cql-replicator-1234567890-us-west-2 --region us-west-2 \
 --src-keyspace ks_test_cql_replicator --src-table test_cql_replicator
 ```
 
@@ -120,7 +120,7 @@ Customers can simply restart the migration process from the point where it was i
 In order, to restart failed CQLReplicator jobs, you need to re-run `--state run` with the same parameters.
 
 ## Get migration stats
-To get replicated row stats, run the following command:
+To obtain the number of rows replicated during the back filling phase, run the following command:
 ```shell
 cqlreplicator --state stats --landing-zone s3://cql-replicator-1234567890-us-west-2 \
 --src-keyspace ks_test_cql_replicator --src-table test_cql_replicator
````

glue/bin/cqlreplicator

Lines changed: 41 additions & 38 deletions

```diff
@@ -6,7 +6,7 @@
 # Migration parameters
 MIGRATOR_VERSION=0.2
 JOB_NAME=CQLReplicator
-TILES=1
+TILES=2
 PROCESS_TYPE_DISCOVERY=discovery
 PROCESS_TYPE_REPLICATION=replication
 SOURCE_KS=ks_test_cql_replicator
@@ -23,7 +23,7 @@ DISCOVERED_TOTAL=0
 REPLICATED_TOTAL=0
 OFFLOAD_LARGE_OBJECTS_B64=$(echo "None" | base64)
 BASE_FOLDER=$(pwd -L)
-AWS_REGION="us-east-1"
+AWS_REGION=""
 SUBNET=""
 SG=""
 AZ=""
@@ -81,13 +81,17 @@ function check_input() {
     return 0
 }
 
-
 function check_discovery_runs() {
     local rs
-    rs=$(aws glue get-job-runs --job-name CQLReplicator --query 'JobRuns[?JobRunState==`RUNNING`] | [].Arguments | [?"--PROCESS_TYPE"==`discovery`]' | jq '.[0]["--SOURCE_TBL"] == "'"$SOURCE_TBL"'" and .[0]["--SOURCE_KS"] == "'"$SOURCE_KS"'"')
-    if [[ $rs = "true" ]]; then
-        #log "ERROR: Discovery job is already running for" $SOURCE_KS.$SOURCE_TBL
-        return 1
+    local mode
+    # mode = true, if discovery job is not running return 0
+    # mode = false, if discovery job is not running return 1
+    mode=$1
+    rs=$(aws glue get-job-runs --job-name CQLReplicator --region "$AWS_REGION" --query 'JobRuns[?JobRunState==`RUNNING`] | [].Arguments | [?"--PROCESS_TYPE"==`discovery`]' | jq '.[0]["--SOURCE_TBL"] == "'"$SOURCE_TBL"'" and .[0]["--SOURCE_KS"] == "'"$SOURCE_KS"'"')
+
+    if [[ $rs == "$mode" ]]; then
+        log "ERROR: The discovery job has failed, check AWS Glue logs"
+        exit 1
     fi
     return 0
 }
```
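The rewritten `check_discovery_runs` folds two call sites into one comparison: the caller passes the state that should be treated as fatal. A standalone sketch of that contract, with the AWS Glue lookup replaced by an explicit `running` argument so it runs anywhere (the helper name is hypothetical, and it returns 1 where the real script calls `exit`):

```shell
# discovery_state_ok RUNNING MODE
#   RUNNING: "true" when a discovery job run is currently RUNNING
#   MODE:    "true"  -> fatal if a run already exists (checked before start)
#            "false" -> fatal if no run exists (checked inside the barrier)
# Mirrors the script's `[[ $rs == "$mode" ]]` test.
discovery_state_ok() {
  local running=$1 mode=$2
  if [[ $running == "$mode" ]]; then
    return 1    # the condition the script treats as fatal
  fi
  return 0
}

discovery_state_ok "false" "true" && echo "safe to start the discovery job"
```

So `discovery_state_ok "true" "true"` (a run already exists before start) and `discovery_state_ok "false" "false"` (the run died while the barrier waits) both fail, matching the two fatal paths in the script.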
```diff
@@ -96,15 +100,23 @@ function check_replication_runs() {
     local tile
     local rs
     tile=$1
-    rs=$(aws glue get-job-runs --job-name CQLReplicator --query 'JobRuns[?JobRunState==`RUNNING`] | [].Arguments | [?"--PROCESS_TYPE"==`replication`]' | jq '.[0]["--SOURCE_TBL"] == "'"$SOURCE_TBL"'" and .[0]["--SOURCE_KS"] == "'"$SOURCE_KS"'" and .[]["--TILE"] == "'"$tile"'"' | grep true)
+    rs=$(aws glue get-job-runs --job-name CQLReplicator --region "$AWS_REGION" --query 'JobRuns[?JobRunState==`RUNNING`] | [].Arguments | [?"--PROCESS_TYPE"==`replication`]' | jq '.[0]["--SOURCE_TBL"] == "'"$SOURCE_TBL"'" and .[0]["--SOURCE_KS"] == "'"$SOURCE_KS"'" and .[]["--TILE"] == "'"$tile"'"' | grep true)
 
-    if [[ $rs = "true" ]]; then
+    if [[ $rs == "true" ]]; then
         #log "ERROR: Replication job is already running per tile $tile for" $SOURCE_KS.$SOURCE_TBL
         return 1
     fi
     return 0
 }
 
+function check_num_tiles() {
+    if [[ $TILES -lt 2 ]]; then
+        log "Total number of tiles should be => 2"
+        exit 1
+    fi
+    return 0
+}
+
 function progress {
     local current="$1"
     local total="$2"
```
```diff
@@ -124,6 +136,7 @@ function progress {
 }
 
 function barrier() {
+    flag_check_discovery_run="$1"
     while true
     do
         cnt=0
@@ -137,6 +150,11 @@ function barrier() {
         if [[ $cnt == "$TILES" ]]; then
             break
         fi
+        if [[ $flag_check_discovery_run == "true" ]]; then
+            # if the discovery job is not running then fail (return 1)
+            sleep 2
+            check_discovery_runs "false"
+        fi
     done
 }
```
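The new `barrier` flag turns an unbounded poll into a fail-fast loop: while tiles are still incomplete, it can probe whether the discovery job is alive and abort instead of waiting forever. A simplified, AWS-free sketch of that loop (`tile_done` and `discovery_running` are stand-in stubs; the real script checks S3 objects and AWS Glue job runs):

```shell
# Stubs standing in for the real S3/Glue checks (test knobs, not real APIs).
tile_done() { [[ $1 -lt $TILES_DONE ]]; }            # tiles 0..TILES_DONE-1 finished
discovery_running() { [[ $DISCOVERY_ALIVE == "true" ]]; }

barrier_sketch() {
  local total_tiles=$1 flag_check_discovery_run=$2
  while true; do
    local cnt=0 tile=0
    while [ "$tile" -lt "$total_tiles" ]; do
      tile_done "$tile" && cnt=$((cnt + 1))
      tile=$((tile + 1))
    done
    # all tiles reported: the barrier opens
    if [[ $cnt == "$total_tiles" ]]; then
      break
    fi
    # fail fast if the discovery job died while we were waiting
    if [[ $flag_check_discovery_run == "true" ]]; then
      discovery_running || return 1
    fi
    sleep 1
  done
}

TILES_DONE=2 DISCOVERY_ALIVE=true barrier_sketch 2 true && echo "barrier released"
```

With the flag set to `"false"` (as the `stats` path does), the liveness probe is skipped and the loop simply waits for all tiles.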

```diff
@@ -152,6 +170,7 @@ function Usage_Exit {
     log "run - Start migration process"
     log "stats - Upload progress. Only for historical workload"
     log "request-stop - Stop migration process"
+    log "cleanup - Delete all CQLReplicator artifacts"
     exit 1
 }
 
@@ -161,9 +180,9 @@ function Clean_Up {
     aws s3 rb "$S3_LANDING_ZONE"
     local connection_name
     connection_name=$(aws glue get-job --job-name CQLReplicator --query 'Job.Connections.Connections[0]' --output text)
-    aws glue delete-connection --connection-name "$connection_name"
-    aws glue delete-job --job-name CQLReplicator
-    aws keyspaces delete-keyspace --keyspace-name migration
+    aws glue delete-connection --connection-name "$connection_name" --region "$AWS_REGION"
+    aws glue delete-job --job-name CQLReplicator --region "$AWS_REGION"
+    aws keyspaces delete-keyspace --keyspace-name migration --region "$AWS_REGION"
 }
 
 function Init {
```
```diff
@@ -262,22 +281,6 @@ function Init {
         "AvailabilityZone":"'$AZ'"}
     }' --region "$AWS_REGION" --endpoint https://glue."$AWS_REGION".amazonaws.com --output json
 
-    #glue_conn_name=$(echo cql-replicator-"$(uuidgen)" | tr ' [:upper:]' ' [:lower:]')
-    # aws glue create-connection --connection-input '{
-    # "Name":"'$glue_conn_name'",
-    # "Description":"CQLReplicator connection to the C* cluster",
-    # "ConnectionType":"CUSTOM",
-    # "ConnectionProperties":{
-    # "CONNECTOR_TYPE": "Spark",
-    # "CONNECTOR_URL": "'$S3_LANDING_ZONE'/artifacts/spark-cassandra-connector-assembly_2.12-3.4.0.jar",
-    # "CONNECTOR_CLASS_NAME": "org.apache.spark.sql.cassandra"
-    # },
-    # "PhysicalConnectionRequirements":{
-    # "SubnetId":"'$SUBNET'",
-    # "SecurityGroupIdList":["'$SG'"],
-    # "AvailabilityZone":"'$AZ'"}
-    # }' --region "$AWS_REGION" --endpoint https://glue."$AWS_REGION".amazonaws.com --output json
-
     # Create Glue Jobs
     aws glue create-job \
         --name "CQLReplicator" \
@@ -375,22 +378,21 @@ function Start_Discovery {
     check_input "$TARGET_KS" "ERROR: target keyspace name is empty, must be provided"
     check_input "$TARGET_TBL" "ERROR: target table name is empty, must be provided"
     check_input "$S3_LANDING_ZONE" "ERROR: landing zone must be provided"
+    check_num_tiles
 
     log "TILES:" "$TILES"
     log "SOURCE:" "$SOURCE_KS"."$SOURCE_TBL"
     log "TARGET:" "$TARGET_KS"."$TARGET_TBL"
     log "LANDING ZONE:" "$S3_LANDING_ZONE"
-    log "Writetime column:" $WRITETIME_COLUMN
-    log "TTL column:" $TTL_COLUMN
+    log "WRITE TIME COLUMN:" $WRITETIME_COLUMN
+    log "TTL COLUMN:" $TTL_COLUMN
     local workers=$((1 + TILES / 2))
     log "Checking if the discovery job is already running..."
-    check_discovery_runs
+    check_discovery_runs "true"
     if [ $? = 0 ]; then
         Delete_Stop_Event_D
         log "Starting the discovery job..."
-        #KEYS_PER_TILE=$(aws s3 cp "$S3_LANDING_ZONE"/"$SOURCE_KS"/"$SOURCE_TBL"/stats/discovery/0/count.json - | head | jq '.primaryKeys')
-        #log "Average primary keys per tile is $KEYS_PER_TILE"
-        rs=$(aws glue start-job-run --job-name "$JOB_NAME" --worker-type G.1X --number-of-workers "$workers" --arguments '{"--PROCESS_TYPE":"'$PROCESS_TYPE_DISCOVERY'",
+        rs=$(aws glue start-job-run --job-name "$JOB_NAME" --worker-type G.1X --number-of-workers "$workers" --region "$AWS_REGION" --arguments '{"--PROCESS_TYPE":"'$PROCESS_TYPE_DISCOVERY'",
             "--TILE":"0",
             "--TOTAL_TILES":"'$TILES'",
             "--S3_LANDING_ZONE":"'$S3_LANDING_ZONE'",
```
```diff
@@ -415,7 +417,7 @@ function Start_Replication {
         check_replication_runs $cnt
         if [ $? = 0 ]; then
             Delete_Stop_Event_R $cnt
-            rs=$(aws glue start-job-run --job-name "$JOB_NAME" --worker-type G.025X --number-of-workers "$workers" --arguments '{"--PROCESS_TYPE":"'$PROCESS_TYPE_REPLICATION'",
+            rs=$(aws glue start-job-run --job-name "$JOB_NAME" --worker-type G.025X --number-of-workers "$workers" --region "$AWS_REGION" --arguments '{"--PROCESS_TYPE":"'$PROCESS_TYPE_REPLICATION'",
                 "--TILE":"'$cnt'",
                 "--TOTAL_TILES":"'$TILES'",
                 "--S3_LANDING_ZONE":"'$S3_LANDING_ZONE'",
@@ -438,7 +440,7 @@ function validate_json() {
     local json_str=$1
 
     # Check if the JSON is valid
-    echo "$json_str" | jq empty #> /dev/null 2>&1
+    echo "$json_str" | jq empty
     if [[ $? -ne 0 ]]; then
         log "ERROR: Invalid JSON"
         log '{"column": "column_name", "bucket": "bucket-name", "prefix": "keyspace_name/table_name/payload", "xref": "reference-column"}'
```
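`validate_json` leans on `jq empty`, which parses its input, produces no output, and exits non-zero on invalid JSON; the change drops the commented-out redirect so jq's own parse error stays visible in the log. A standalone sketch of the same check for an `--offload-large-objects` payload (hypothetical helper name, plain `echo` instead of the script's `log`):

```shell
# Validate an --offload-large-objects payload the way the script does:
# `jq empty` makes the exit status the whole test.
validate_offload_config() {
  local json_str=$1
  if ! echo "$json_str" | jq empty; then
    echo "ERROR: Invalid JSON"
    echo '{"column": "column_name", "bucket": "bucket-name", "prefix": "keyspace_name/table_name/payload", "xref": "reference-column"}'
    return 1
  fi
  return 0
}

validate_offload_config '{"column":"col1","bucket":"my-application-resource","prefix":"ks_test_cql_replicator/test_cql_replicator/col1","xref":"link"}' \
  && echo "offload config OK"
```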
```diff
@@ -595,7 +597,7 @@ eval set -- "$PARAMS"
 
 if [[ $STATE == run ]]; then
     Start_Discovery
-    barrier
+    barrier "true"
     Start_Replication
     log "Started jobs:" "${JOBS[@]}"
 fi
@@ -632,7 +634,8 @@ if [[ $STATE == cleanup ]]; then
 fi
 
 if [[ $STATE == stats ]]; then
-    barrier
+    # the barrier without checking if the discovery job is running
+    barrier "false"
     tile=0
     while [ $tile -lt "$TILES" ]
     do
```

glue/sbin/CQLReplicator.scala

Lines changed: 64 additions & 5 deletions

```diff
@@ -21,7 +21,9 @@ import org.apache.spark.sql.expressions.Window
 import org.apache.spark.sql.functions
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.DataFrame
+
 import java.time.Duration
+import java.util.Optional
 
 import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
@@ -58,11 +60,16 @@ class CassandraTypeException(s: String) extends RuntimeException {}
 
 object GlueApp {
   def main(sysArgs: Array[String]) {
+
     def shuffleDf(df: DataFrame): DataFrame = {
       val encoder = RowEncoder(df.schema)
       df.mapPartitions(new scala.util.Random().shuffle(_))(encoder)
     }
 
+    def shuffleDfV2(df: DataFrame): DataFrame = {
+      df.orderBy(rand())
+    }
+
     def customConnectionFactory(sc: SparkContext): (CassandraConnector, CassandraConnector) = {
       val connectorToClusterSrc = CassandraConnector(sc.getConf.set("spark.cassandra.connection.config.profile.path", "KeyspacesConnector.conf"))
       val connectorToClusterTrg = CassandraConnector(sc.getConf.set("spark.cassandra.connection.config.profile.path", "CassandraConnector.conf"))
@@ -97,6 +104,50 @@ object GlueApp {
       }
     }
 
+    def preFlightCheck(connection: CassandraConnector, keyspace: String, table: String, dir: String): Unit = {
+      val logger = new GlueLogger
+      Try {
+        val c1 = Option(connection.openSession)
+        c1.isEmpty match {
+          case false => {
+            c1.get.getMetadata.getKeyspace(keyspace).isPresent match {
+              case true => {
+                c1.get.getMetadata.getKeyspace(keyspace).get.getTable(table).isPresent match {
+                  case true => {
+                    logger.info(s"the $dir table $table exists")
+                  }
+                  case false => {
+                    val err = s"ERROR: the $dir table $table does not exist"
+                    logger.error(err)
+                    sys.exit(-1)
+                  }
+                }
+              }
+              case false => {
+                val err = s"ERROR: the $dir keyspace $keyspace does not exist"
+                logger.error(err)
+                sys.exit(-1)
+              }
+            }
+          }
+          case _ => {
+            val err = s"ERROR: The job was not able to connecto to the $dir"
+            logger.error(err)
+            sys.exit(-1)
+          }
+        }
+      } match {
+        case Failure(_) => {
+          val err = s"ERROR: Detected connectivity issue. Check the reference conf file/Glue connection for the $dir, the job is aborted"
+          logger.error(err)
+          sys.exit(-1)
+        }
+        case Success(_) => {
+          logger.info(s"Connected to the $dir")
+        }
+      }
+    }
+
     val WAIT_TIME = 10000
     val MAX_RETRY_ATTEMPTS = 256
     // Unit ms
@@ -133,10 +184,21 @@ object GlueApp {
     val cassandraConn = customConnections._2
     val keyspacesConn = customConnections._1
     val landingZone = args("S3_LANDING_ZONE")
+    val bcktName = landingZone.replaceAll("s3://", "")
     val columnTs = args("WRITETIME_COLUMN")
     val source = s"sourceCluster.$srcKeyspaceName.$srcTableName"
     val ttlColumn = args("TTL_COLUMN")
     val olo = args("OFFLOAD_LARGE_OBJECTS")
+
+    //AmazonS3Client to check if a stop requested issued
+    val s3client = new AmazonS3Client()
+
+    // Let's do preflight checks
+    logger.info("Preflight check started")
+    preFlightCheck(cassandraConn, srcKeyspaceName, srcTableName, "source")
+    preFlightCheck(keyspacesConn, trgKeyspaceName, trgTableName, "target")
+    logger.info("Preflight check completed")
+
     val selectStmtWithTTL = ttlColumn match {
       case "None" => ""
       case _ => {
@@ -162,9 +224,6 @@ object GlueApp {
 
     val offloadLargeObjects = parseJSONConfig(offloadLageObjTmp)
 
-    //AmazonS3Client to check if a stop requested issued
-    val s3client = new AmazonS3Client()
-
     def stopRequested(bucket: String): Boolean = {
       val key = processType match {
         case "discovery" => s"$srcKeyspaceName/$srcTableName/$processType/stopRequested"
@@ -352,7 +411,7 @@ object GlueApp {
       val tile = location._2
       val ver = location._3
 
-      persistToTarget(shuffleDf(sourceDfV2), columns, columnsPos, tile, ver)
+      persistToTarget(shuffleDfV2(sourceDfV2), columns, columnsPos, tile, ver)
       keyspacesConn.withSessionDo {
         session => session.execute(s"INSERT INTO migration.ledger(ks,tbl,tile,ver,load_status,dt_load, offload_status) VALUES('$srcKeyspaceName','$srcTableName',$tile,'$ver','SUCCESS', toTimestamp(now()), '')")
       }
@@ -401,6 +460,7 @@ object GlueApp {
       }
     }
 
+    // partitonBy instead of repartition
     def keysDiscoveryProcess() {
       val primaryKeysDf = sparkSession.read.option("inferSchema", "true").table(source)
       val primaryKeysDfwithTS = primaryKeysDf.selectExpr(pkFinal.map(c => c): _*)
@@ -471,7 +531,6 @@ object GlueApp {
       groupedPkDF.unpersist()
     }
 
-    val bcktName = landingZone.replaceAll("s3://", "")
     Iterator.continually(stopRequested(bcktName)).takeWhile(_ == false).foreach {
       _ => {
         processType match {
```
