Skip to content

Commit 1fae1c9

Browse files
authored
Jobs, buckets and sagemaker infrastructure (#49)
* jobs, buckets and sagemaker infrastructure * Anomaly detection infrastructure * script change in infrastructure * default quantile change
1 parent 85ae65c commit 1fae1c9

14 files changed

Lines changed: 488 additions & 29 deletions

File tree

infrastructure/main.tf

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@ module "s3_buckets" {
33
}
44

55
module "iam" {
6-
source = "./modules/iam"
7-
glue_role_name = var.glue_role_name
6+
source = "./modules/iam"
7+
glue_role_name = var.glue_role_name
8+
sagemaker_execution_role_name = var.sagemaker_execution_role_name
89
}
910

1011
module "glue_catalog" {
@@ -14,22 +15,23 @@ module "glue_catalog" {
1415
bdp_wallets_aggregations_bucket = module.s3_buckets.bdp_wallets_aggregations_bucket
1516
bdp_scaled_features_bucket = module.s3_buckets.bdp_scaled_features_bucket
1617
bdp_unscaled_features_bucket = module.s3_buckets.bdp_unscaled_features_bucket
18+
bdp_anomaly_detection_bucket = module.s3_buckets.bdp_anomaly_detection_bucket
1719
}
1820

1921
module "iam_github_role" {
2022
source = "./modules/iam_github_role"
2123
github_role_name = var.github_role_name
22-
glue_script_bucket = module.s3_buckets.glue_scripts_bucket
24+
glue_script_bucket = module.s3_buckets.bdp_glue_scripts_bucket
2325
}
2426

2527
module "iam_github_user" {
2628
source = "./modules/iam_github_user"
27-
glue_script_bucket = module.s3_buckets.glue_scripts_bucket
29+
glue_script_bucket = module.s3_buckets.bdp_glue_scripts_bucket
2830
}
2931

3032
module "glue_jobs" {
3133
source = "./modules/glue_jobs"
32-
glue_script_bucket = module.s3_buckets.glue_scripts_bucket
34+
glue_script_bucket = module.s3_buckets.bdp_glue_scripts_bucket
3335
glue_role_arn = module.iam.glue_role_arn
3436
default_arguments = var.glue_jobs_default_arguments
3537
}
@@ -40,4 +42,9 @@ module "glue_workflows" {
4042
transactions_cleaning_job_name = module.glue_jobs.transactions_cleaning_job_name
4143
wallets_aggregations_job_name = module.glue_jobs.wallets_aggregations_job_name
4244
feature_scaling_job_name = module.glue_jobs.feature_scaling_job_name
45+
}
46+
47+
module "sagemaker_notebooks" {
48+
source = "./modules/sagemaker_notebooks"
49+
sagemaker_execution_role_arn = module.iam.sagemaker_execution_role_arn
4350
}

infrastructure/modules/glue_catalog/main.tf

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ resource "aws_glue_catalog_database" "bdp_db" {
1717
name = "network_name"
1818
type = "string"
1919
}
20+
partition_keys {
21+
name = "day(block_timestamp)"
22+
type = "timestamp"
23+
}
2024
2125
parameters = {
2226
"write.format.default" = "parquet"
@@ -137,6 +141,10 @@ resource "aws_glue_catalog_table_optimizer" "cleaned_transactions_compaction_opt
137141
name = "network_name"
138142
type = "string"
139143
}
144+
partition_keys {
145+
name = "day(last_transaction_timestamp)"
146+
type = "timestamp"
147+
}
140148
141149
parameters = {
142150
"write.format.default" = "parquet",
@@ -1014,6 +1022,104 @@ resource "aws_glue_catalog_table_optimizer" "unscaled_features_compaction_optimi
10141022
table_name = "unscaled_features"
10151023
type = "compaction"
10161024

1025+
configuration {
1026+
role_arn = var.glue_role_arn
1027+
enabled = true
1028+
}
1029+
}
1030+
1031+
/*resource "aws_glue_catalog_table" "anomaly_detection" {
1032+
database_name = aws_glue_catalog_database.bdp_db.name
1033+
name = "anomaly_detection"
1034+
table_type = "EXTERNAL_TABLE"
1035+
1036+
open_table_format_input {
1037+
iceberg_input {
1038+
metadata_operation = "CREATE"
1039+
}
1040+
}
1041+
//Commented because https://github.com/hashicorp/terraform-provider-aws/issues/36531
1042+
partition_keys {
1043+
name = "network_name"
1044+
type = "string"
1045+
}
1046+
1047+
partition_keys {
1048+
name = "day(block_timestamp_unscaled)"
1049+
type = "timestamp"
1050+
}
1051+
1052+
parameters = {
1053+
"write.format.default" = "parquet"
1054+
"write.parquet.compression-codec" = "zstd"
1055+
}
1056+
1057+
storage_descriptor {
1058+
location = "s3://${var.bdp_anomaly_detection_bucket}"
1059+
input_format = "org.apache.hadoop.mapred.FileInputFormat"
1060+
output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
1061+
compressed = true
1062+
1063+
ser_de_info {
1064+
name = "anomaly_detection_serde"
1065+
serialization_library = "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
1066+
}
1067+
1068+
1069+
columns {
1070+
name = "transaction_hash"
1071+
type = "string"
1072+
}
1073+
columns {
1074+
name = "sender_address"
1075+
type = "string"
1076+
}
1077+
columns {
1078+
name = "receiver_address"
1079+
type = "string"
1080+
}
1081+
columns {
1082+
name = "block_timestamp_unscaled"
1083+
type = "timestamp"
1084+
}
1085+
columns {
1086+
name = "network_name"
1087+
type = "string"
1088+
}
1089+
columns {
1090+
name = "is_anomaly"
1091+
type = "boolean"
1092+
}
1093+
1094+
}
1095+
}
1096+
}*/
1097+
1098+
resource "aws_glue_catalog_table_optimizer" "anomaly_detection_orphan_files_deletion_optimizer" {
1099+
catalog_id = "982534349340"
1100+
database_name = aws_glue_catalog_database.bdp_db.name
1101+
table_name = "anomaly_detection"
1102+
type = "orphan_file_deletion"
1103+
1104+
configuration {
1105+
role_arn = var.glue_role_arn
1106+
enabled = true
1107+
1108+
orphan_file_deletion_configuration {
1109+
iceberg_configuration {
1110+
orphan_file_retention_period_in_days = 2
1111+
location = "s3://${var.bdp_anomaly_detection_bucket}"
1112+
}
1113+
}
1114+
}
1115+
}
1116+
1117+
resource "aws_glue_catalog_table_optimizer" "anomaly_detection_compaction_optimizer" {
1118+
catalog_id = "982534349340"
1119+
database_name = aws_glue_catalog_database.bdp_db.name
1120+
table_name = "anomaly_detection"
1121+
type = "compaction"
1122+
10171123
configuration {
10181124
role_arn = var.glue_role_arn
10191125
enabled = true

infrastructure/modules/glue_catalog/variables.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ variable "bdp_unscaled_features_bucket" {
1818
description = "Unscaled features bucket name"
1919
}
2020

21+
variable "bdp_anomaly_detection_bucket" {
22+
type = string
23+
description = "Anomaly detection bucket name"
24+
}
25+
2126
variable "glue_role_arn" {
2227
type = string
2328
description = "ARN of IAM role for Glue"

infrastructure/modules/glue_jobs/main.tf

Lines changed: 116 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,24 @@
11
locals {
22
transactions_cleaning_arguments = {
3-
"--END_DATE" = "2024-11-30"
4-
"--START_DATE" = "2024-11-1"
3+
"--END_DATE" = "2024-12-31"
4+
"--START_DATE" = "2024-10-1"
55
"--NETWORK_PREFIX" = "all"
66
}
77

8-
transactions_cleaning_final_arguments = merge(
9-
var.default_arguments,
10-
local.transactions_cleaning_arguments
11-
)
8+
iceberg_argument = {
9+
"--datalake-formats" = "iceberg"
10+
}
11+
12+
converting_to_recordio_arguments = {
13+
"--extra-jars" = "s3://bdp-glue-scripts/sagemaker-spark_2.12-spark_3.3.0-1.4.6.dev0.jar"
14+
"--python-modules-installer-option" = "-r"
15+
"--additional-python-modules" = "s3://bdp-glue-scripts/requirements.txt"
16+
}
17+
18+
anomaly_classification_arguments = {
19+
"--QUANTILE" = 0.99
20+
}
21+
1222
}
1323

1424
resource "aws_glue_job" "transactions_cleaning" {
@@ -20,10 +30,10 @@ resource "aws_glue_job" "transactions_cleaning" {
2030
python_version = "3"
2131
}
2232

23-
worker_type = "G.1X"
33+
worker_type = "G.2X"
2434
number_of_workers = 10
2535
glue_version = "5.0"
26-
default_arguments = local.transactions_cleaning_final_arguments
36+
default_arguments = merge(var.default_arguments, local.iceberg_argument, local.transactions_cleaning_arguments)
2737
timeout = 120
2838
}
2939

@@ -37,10 +47,10 @@ resource "aws_glue_job" "wallets_aggregations" {
3747
python_version = "3"
3848
}
3949

40-
worker_type = "G.1X"
50+
worker_type = "G.2X"
4151
number_of_workers = 10
4252
glue_version = "5.0"
43-
default_arguments = var.default_arguments
53+
default_arguments = merge(var.default_arguments, local.iceberg_argument)
4454
timeout = 120
4555
}
4656

@@ -53,9 +63,105 @@ resource "aws_glue_job" "feature_scaling" {
5363
python_version = "3"
5464
}
5565

66+
worker_type = "G.2X"
67+
number_of_workers = 10
68+
glue_version = "5.0"
69+
default_arguments = merge(var.default_arguments, local.iceberg_argument)
70+
timeout = 120
71+
}
72+
73+
resource "aws_glue_job" "spearman_feature_selection" {
74+
name = "Spearman feature selection"
75+
role_arn = var.glue_role_arn
76+
command {
77+
name = "glueetl"
78+
script_location = "s3://${var.glue_script_bucket}/spearman.py"
79+
python_version = "3"
80+
}
81+
82+
worker_type = "G.2X"
83+
number_of_workers = 10
84+
glue_version = "5.0"
85+
default_arguments = var.default_arguments
86+
timeout = 300
87+
}
88+
89+
resource "aws_glue_job" "convert_parquet_to_csv" {
90+
name = "Convert parquet to CSV"
91+
role_arn = var.glue_role_arn
92+
command {
93+
name = "glueetl"
94+
script_location = "s3://${var.glue_script_bucket}/convert_features_to_csv.py"
95+
python_version = "3"
96+
}
97+
98+
worker_type = "G.1X"
99+
number_of_workers = 10
100+
glue_version = "5.0"
101+
default_arguments = var.default_arguments
102+
timeout = 480
103+
}
104+
105+
resource "aws_glue_job" "convert_features_to_recordio" {
106+
name = "Convert features to recordio"
107+
role_arn = var.glue_role_arn
108+
command {
109+
name = "glueetl"
110+
script_location = "s3://${var.glue_script_bucket}/convert_features_to_recordio.py"
111+
python_version = "3"
112+
}
113+
114+
worker_type = "G.2X"
115+
number_of_workers = 10
116+
glue_version = "5.0"
117+
default_arguments = merge(var.default_arguments, local.converting_to_recordio_arguments)
118+
timeout = 180
119+
}
120+
121+
resource "aws_glue_job" "preprocessing_with_string_columns" {
122+
name = "Preprocesssing with string columns"
123+
role_arn = var.glue_role_arn
124+
command {
125+
name = "glueetl"
126+
script_location = "s3://${var.glue_script_bucket}/preprocessing_for_inference.py"
127+
python_version = "3"
128+
}
129+
56130
worker_type = "G.1X"
57131
number_of_workers = 10
58132
glue_version = "5.0"
59133
default_arguments = var.default_arguments
60134
timeout = 120
135+
}
136+
137+
resource "aws_glue_job" "convert_parquet_to_csv_for_visualisation" {
138+
name = "Convert parquet to csv for visualization"
139+
role_arn = var.glue_role_arn
140+
command {
141+
name = "glueetl"
142+
script_location = "s3://${var.glue_script_bucket}/convert_features_to_csv_inference.py"
143+
python_version = "3"
144+
}
145+
146+
worker_type = "G.1X"
147+
number_of_workers = 10
148+
glue_version = "5.0"
149+
default_arguments = var.default_arguments
150+
timeout = 120
151+
}
152+
153+
resource "aws_glue_job" "anomaly_classification" {
154+
name = "Anomaly Classification"
155+
role_arn = var.glue_role_arn
156+
command {
157+
name = "glueetl"
158+
script_location = "s3://${var.glue_script_bucket}/detect_anomaly.py"
159+
python_version = "3"
160+
}
161+
162+
worker_type = "G.1X"
163+
number_of_workers = 10
164+
glue_version = "5.0"
165+
default_arguments = merge(var.default_arguments, local.anomaly_classification_arguments)
166+
timeout = 120
61167
}

infrastructure/modules/glue_jobs/outputs.tf

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,29 @@ output "wallets_aggregations_job_name" {
1111
output "feature_scaling_job_name" {
1212
value = aws_glue_job.feature_scaling.name
1313
description = "Name of the feature_scaling Glue job"
14+
}
15+
16+
output "spearman_feature_selection_job_name" {
17+
value = aws_glue_job.spearman_feature_selection.name
18+
description = "Name of the spearman_feature_selection Glue job"
19+
}
20+
21+
output "convert_parquet_to_csv_job_name" {
22+
value = aws_glue_job.convert_parquet_to_csv.name
23+
description = "Name of the convert_parquet_to_csv Glue job"
24+
}
25+
26+
output "convert_features_to_recordio_job_name" {
27+
value = aws_glue_job.convert_features_to_recordio.name
28+
description = "Name of the convert_features_to_recordio Glue job"
29+
}
30+
31+
output "preprocessing_with_string_columns_job_name" {
32+
value = aws_glue_job.preprocessing_with_string_columns.name
33+
description = "Name of the preprocessing_with_string_columns Glue job"
34+
}
35+
36+
output "convert_parquet_to_csv_for_visualisation_job_name" {
37+
value = aws_glue_job.convert_parquet_to_csv_for_visualisation.name
38+
description = "Name of the convert_parquet_to_csv_for_visualisation Glue job"
1439
}

0 commit comments

Comments (0)