AWS EMR Cluster Automation Using Concourse & Terraform
Why Terraform?
Terraform - High Level Workflow
Automation Flow Diagram
What is the code doing?
What more could be done?
I have provided 3 Terraform files here:
1> main.tf : This file contains the actual code that creates the EMR infrastructure.
Recommended by LinkedIn
# AWS provider configuration: resources are created in the us-east-1 region.
provider "aws" {
  region = "us-east-1"
}
# Remote state: the Terraform state is stored in the "emr-automation" S3
# bucket under output/emrstate.tfstate.
terraform {
  backend "s3" {
    bucket = "emr-automation"
    key    = "output/emrstate.tfstate"
    region = "us-east-1"
  }
}
# EMR cluster with one master and one core instance group.
# Apart from termination_protection, every setting is read from an input
# variable. Variables are referenced directly (var.x) instead of through the
# deprecated interpolation-only form ("${var.x}").
resource "aws_emr_cluster" "cluster" {
  name                   = var.name
  release_label          = var.release_label
  applications           = var.applications
  termination_protection = false
  autoscaling_role       = var.autoscaling_role

  # var.configurations_json is a file path; the file's content is what is
  # handed to EMR.
  configurations_json = file(var.configurations_json)

  log_uri      = var.log_uri
  service_role = var.service_role

  /*
  Disabled: would create one step per entry of the JSON array found in the
  file that var.steps points to.

  dynamic "step" {
    for_each = jsondecode(templatefile(var.steps, {}))
    content {
      action_on_failure = step.value.action_on_failure
      name              = step.value.name
      hadoop_jar_step {
        jar  = step.value.hadoop_jar_step.jar
        args = step.value.hadoop_jar_step.args
      }
    }
  }
  */

  step_concurrency_level = var.step_concurrency_level

  ec2_attributes {
    key_name                          = var.key_name
    subnet_id                         = var.subnet_id
    emr_managed_master_security_group = var.emr_managed_master_security_group
    emr_managed_slave_security_group  = var.emr_managed_slave_security_group
    service_access_security_group     = var.service_access_security_group
    instance_profile                  = var.instance_profile
  }

  master_instance_group {
    name           = var.master_instance_group_name
    instance_type  = var.master_instance_group_instance_type
    instance_count = var.master_instance_group_instance_count
    bid_price      = var.master_instance_group_bid_price

    ebs_config {
      iops                 = var.master_instance_group_ebs_iops
      size                 = var.master_instance_group_ebs_size
      type                 = var.master_instance_group_ebs_type
      volumes_per_instance = var.master_instance_group_ebs_volumes_per_instance
    }
  }

  core_instance_group {
    name           = var.core_instance_group_name
    instance_type  = var.core_instance_group_instance_type
    instance_count = var.core_instance_group_instance_count
    bid_price      = var.core_instance_group_bid_price #Do not use core instances as Spot Instance in Prod because terminating a core instance risks data loss.

    ebs_config {
      iops                 = var.core_instance_group_ebs_iops
      size                 = var.core_instance_group_ebs_size
      type                 = var.core_instance_group_ebs_type
      volumes_per_instance = var.core_instance_group_ebs_volumes_per_instance
    }

    # The policy is read from the file that the variable points to.
    autoscaling_policy = file(var.core_instance_group_autoscaling_policy)
  }

  tags = {
    Name        = var.name
    Project     = var.project
    Environment = var.environment
  }
}
# Task instance group attached to the EMR cluster defined in this file.
resource "aws_emr_instance_group" "task_instance_group" {
  name = var.task_instance_group_name

  # The cluster resource has no count/for_each, so its id is referenced
  # directly rather than through a splat expression joined into a string.
  cluster_id = aws_emr_cluster.cluster.id

  instance_type  = var.task_instance_group_instance_type
  instance_count = var.task_instance_group_instance_count
  bid_price      = var.task_instance_group_bid_price #Spot Instances are preferred in Prod

  # Both values below are file paths; the file contents are what is passed on.
  configurations_json = file(var.configurations_json)
  autoscaling_policy  = file(var.task_instance_group_autoscaling_policy)

  ebs_config {
    iops                 = var.task_instance_group_ebs_iops
    size                 = var.task_instance_group_ebs_size
    type                 = var.task_instance_group_ebs_type
    volumes_per_instance = var.task_instance_group_ebs_volumes_per_instance
  }
}
2> variables.tf: This file contains all variables & values
# Value of the "Project" tag on the cluster.
variable "project" {
  default = "emr-automation"
}

# Value of the "Environment" tag on the cluster.
variable "environment" {
  description = "Dev/Prod/Stage"
  default     = "dev"
}

# Used both as the cluster name and as its "Name" tag.
variable "name" {
  description = "Name of the EMR cluster to be created"
  default     = "Terraform-Automation"
}

# Passed to the cluster's step_concurrency_level argument.
variable "step_concurrency_level" {
  default = 1
}

variable "release_label" {
  description = "EMR Version"
  default     = "emr-6.2.0"
}

# Passed to the cluster's autoscaling_role argument; replace the x
# placeholders with real values.
variable "autoscaling_role" {
  default = "arn:aws:iam::xxxxxx:role/xxxx/EMR_AutoScaling_DefaultRole"
}

variable "applications" {
  type        = list(string)
  description = "Name the applications to be installed"
  default = [
    "Hadoop",
    "Hive",
    "Hue",
    "JupyterHub",
    "Pig",
    "Presto",
    "Spark",
  ]
}
#------------------------------Master Instance Group------------------------------#
# The declaration below must start on its own line: when it is fused into the
# section comment above, the variable header is commented out and its body is
# left orphaned.
variable "master_instance_group_name" {
  type        = string
  description = "Name of the Master instance group"
  default     = "MasterGroup"
}

variable "master_instance_group_instance_type" {
  type        = string
  description = "EC2 instance type for all instances in the Master instance group"
  default     = "m5.xlarge"
}

variable "master_instance_group_instance_count" {
  type        = number
  description = "Target number of instances for the Master instance group. Must be at least 1"
  default     = 1
}

variable "master_instance_group_ebs_size" {
  type        = number
  description = "Master instances volume size, in gibibytes (GiB)"
  default     = 30
}

variable "master_instance_group_ebs_type" {
  type        = string
  description = "Master instances volume type. Valid options are `gp2`, `io1`, `standard` and `st1`"
  default     = "gp2"
}

variable "master_instance_group_ebs_iops" {
  type        = number
  description = "The number of I/O operations per second (IOPS) that the Master volume supports"
  default     = null
}

variable "master_instance_group_ebs_volumes_per_instance" {
  type        = number
  description = "The number of EBS volumes with this configuration to attach to each EC2 instance in the Master instance group"
  default     = 1
}

variable "master_instance_group_bid_price" {
  type        = string
  description = "Bid price for each EC2 instance in the Master instance group, expressed in USD. By setting this attribute, the instance group is being declared as a Spot Instance, and will implicitly create a Spot request. Leave this blank to use On-Demand Instances"
  # Quoted so the default matches the declared string type.
  default = "0.3"
}
#----------------------Core Instance Group-----------------------------------#
variable "core_instance_group_name" {
  type        = string
  description = "Name of the Core instance group"
  default     = "CoreGroup"
}

variable "core_instance_group_instance_type" {
  type        = string
  description = "EC2 instance type for all instances in the Core instance group"
  default     = "m5.xlarge"
}

variable "core_instance_group_instance_count" {
  type        = number
  description = "Target number of instances for the Core instance group. Must be at least 1"
  default     = 1
}

variable "core_instance_group_ebs_size" {
  type        = number
  description = "Core instances volume size, in gibibytes (GiB)"
  default     = 30
}

variable "core_instance_group_ebs_type" {
  type        = string
  description = "Core instances volume type. Valid options are `gp2`, `io1`, `standard` and `st1`"
  default     = "gp2"
}

variable "core_instance_group_ebs_iops" {
  type        = number
  description = "The number of I/O operations per second (IOPS) that the Core volume supports"
  default     = null
}

variable "core_instance_group_ebs_volumes_per_instance" {
  type        = number
  description = "The number of EBS volumes with this configuration to attach to each EC2 instance in the Core instance group"
  default     = 1
}

variable "core_instance_group_bid_price" {
  type        = string
  description = "Bid price for each EC2 instance in the Core instance group, expressed in USD. By setting this attribute, the instance group is being declared as a Spot Instance, and will implicitly create a Spot request. Leave this blank to use On-Demand Instances"
  # Quoted so the default matches the declared string type.
  default = "0.3"
}

variable "core_instance_group_autoscaling_policy" {
  type        = string
  description = "Path to the file containing the EMR Auto Scaling Policy JSON for the Core instance group"
  default     = "./additional_files/core_instance_group-autoscaling_policy.json.tpl"
}
#-----------------Task Instance Group----------------
variable "task_instance_group_name" {
  type        = string
  description = "Name of the task instance group"
  default     = "taskGroup"
}

variable "task_instance_group_instance_type" {
  type        = string
  description = "EC2 instance type for all instances in the task instance group"
  default     = "m5.xlarge"
}

variable "task_instance_group_instance_count" {
  type        = number
  description = "Target number of instances for the task instance group. Must be at least 1"
  default     = 1
}

variable "task_instance_group_ebs_size" {
  type        = number
  description = "task instances volume size, in gibibytes (GiB)"
  default     = 30
}

variable "task_instance_group_ebs_type" {
  type        = string
  description = "task instances volume type. Valid options are `gp2`, `io1`, `standard` and `st1`"
  default     = "gp2"
}

variable "task_instance_group_ebs_iops" {
  type        = number
  description = "The number of I/O operations per second (IOPS) that the task volume supports"
  default     = null
}

variable "task_instance_group_ebs_volumes_per_instance" {
  type        = number
  description = "The number of EBS volumes with this configuration to attach to each EC2 instance in the task instance group"
  default     = 1
}

variable "task_instance_group_bid_price" {
  type        = string
  description = "Bid price for each EC2 instance in the task instance group, expressed in USD. By setting this attribute, the instance group is being declared as a Spot Instance, and will implicitly create a Spot request. Leave this blank to use On-Demand Instances"
  # Quoted so the default matches the declared string type.
  default = "0.3"
}

variable "task_instance_group_autoscaling_policy" {
  type        = string
  description = "Path to the file containing the EMR Auto Scaling Policy JSON for the task instance group"
  default     = "./additional_files/task_instance_group-autoscaling_policy.json.tpl"
}
#-------------------------------------------------------------------------------#
# EC2 attributes, IAM role and auxiliary file paths. The key pair, subnet,
# security group and role defaults are placeholders to be replaced with real
# values.
variable "key_name" {
  default = "key-pair-name"
}

variable "subnet_id" {
  default = "subnet-xxxx"
}

variable "instance_profile" {
  default = "EMR_EC2_DefaultRole"
}

variable "service_access_security_group" {
  default = "sg-xxxx"
}

variable "emr_managed_master_security_group" {
  default = "sg-xxxx"
}

variable "emr_managed_slave_security_group" {
  default = "sg-xxxxx"
}

variable "service_role" {
  default = "arn:aws:iam::xxxxx:role/xxxx/EMR_DefaultRole"
}

# The value is a file path: the resources read it with file().
variable "configurations_json" {
  type        = string
  description = "Path to the JSON file supplying the list of configurations for the EMR cluster"
  default     = "./additional_files/configuration.json.tpl"
}

variable "log_uri" {
  default = "s3://emr-automation/emr-logs/"
}

# The value is a file path: it is only referenced by the disabled dynamic
# "step" block in the cluster resource, via templatefile().
variable "steps" {
  type        = string
  description = "Path to the JSON template listing the steps to execute after creation of EMR"
  default     = "./additional_files/steps.json.tpl"
}
Apart from these Terraform files, I am using a few more JSON template files for configuration; their paths are set in the variables.tf file. If you want to execute this Terraform script, you must place these JSON files in a folder named additional_files.
1> configuration.json.tpl : This file contains the basic configurations for EMR like JAVA HOME, Hive Metastore, Catalog etc. Choose the values as per your environment & requirements.
[
{
"classification":"emrfs-site",
"properties":{"fs.s3.consistent.retryPeriodSeconds":"10", "fs.s3.consistent":"true", "fs.s3.consistent.retryCount":"5", "fs.s3.consistent.metadata.tableName":"EmrFSMetadata"}, "configurations":[]
},
{
"classification":"spark",
"properties":{"maximizeResourceAllocation":"true"},
"configurations":[]
},
{
"classification":"hive-site",
"properties":{"hive.metastore.client.factory.class":"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"},
"configurations":[]
},
{
"classification":"presto-connector-hive",
"properties":{"hive.metastore.glue.datacatalog.enabled":"true"},
"configurations":[]
},
{
"classification":"spark-hive-site",
"properties":{"hive.metastore.client.factory.class":"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"},
"configurations":[]
},
{
"Classification": "hadoop-env",
"Configurations": [
{
"Classification": "export",
"Configurations": [],
"Properties": {"JAVA_HOME": "/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/"}
}
],
"Properties": {}
},
{
"Classification": "spark-env",
"Configurations": [
{
"Classification": "export",
"Configurations": [],
"Properties": {"JAVA_HOME": "/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/"}
}
],
"Properties": {}
}
]
2> steps.json.tpl: This file contains the paths of the scripts to be executed in EMR. Each step works like a spark-submit job. You should also include your custom jars (if any). Click here for more details about Steps.
[
{
"action_on_failure" : "CONTINUE",
"hadoop_jar_step" :
{
"jar" : "command-runner.jar",
"args" : [
"spark-submit",
"s3://emr-automation/code/required-jars.jar"
]
},
"name" : "Custom Jars"
}
]
3> task_instance_group-autoscaling_policy.json.tpl: This file contains the auto-scaling (Out & In) policies for the EMR task instance group. You can include more CloudWatch metrics as auto-scaling triggers. Check here for AWS Auto-Scaling (Auto-Scaling).
{
"Constraints": {
"MinCapacity": 1,
"MaxCapacity": 3
},
"Rules": [
{
"Name": "ScaleOutMemoryPercentage",
"Description": "Scale out if YARNMemoryAvailablePercentage is less than 15",
"Action": {
"SimpleScalingPolicyConfiguration": {
"AdjustmentType": "CHANGE_IN_CAPACITY",
"ScalingAdjustment": 1,
"CoolDown": 100
}
},
"Trigger": {
"CloudWatchAlarmDefinition": {
"ComparisonOperator": "LESS_THAN",
"EvaluationPeriods": 1,
"MetricName": "YARNMemoryAvailablePercentage",
"Namespace": "AWS/ElasticMapReduce",
"Period": 300,
"Statistic": "AVERAGE",
"Threshold": 15.0,
"Unit": "PERCENT"
}
}
}
,
{
"Name": "ScaleInMemoryPercentage",
"Description": "Scale in if YARNMemoryAvailablePercentage is greater than or equal to 60",
"Action": {
"SimpleScalingPolicyConfiguration": {
"AdjustmentType": "CHANGE_IN_CAPACITY",
"ScalingAdjustment": -1,
"CoolDown": 300
}
},
"Trigger": {
"CloudWatchAlarmDefinition": {
"ComparisonOperator": "GREATER_THAN_OR_EQUAL",
"EvaluationPeriods": 1,
"MetricName": "YARNMemoryAvailablePercentage",
"Namespace": "AWS/ElasticMapReduce",
"Period": 600,
"Statistic": "AVERAGE",
"Threshold": 60.0,
"Unit": "PERCENT"
}
}
}
]
}
4> core_instance_group-autoscaling_policy.json.tpl: This file is the same as above. This file is used for Core Instance Group.
{
"Constraints": {
"MinCapacity": 1,
"MaxCapacity": 3
},
"Rules": [
{
"Name": "ScaleOutMemoryPercentage",
"Description": "Scale out if YARNMemoryAvailablePercentage is less than 15",
"Action": {
"SimpleScalingPolicyConfiguration": {
"AdjustmentType": "CHANGE_IN_CAPACITY",
"ScalingAdjustment": 1,
"CoolDown": 100
}
},
"Trigger": {
"CloudWatchAlarmDefinition": {
"ComparisonOperator": "LESS_THAN",
"EvaluationPeriods": 1,
"MetricName": "YARNMemoryAvailablePercentage",
"Namespace": "AWS/ElasticMapReduce",
"Period": 300,
"Statistic": "AVERAGE",
"Threshold": 15.0,
"Unit": "PERCENT"
}
}
}
,
{
"Name": "ScaleInMemoryPercentage",
"Description": "Scale in if YARNMemoryAvailablePercentage is greater than or equal to 60",
"Action": {
"SimpleScalingPolicyConfiguration": {
"AdjustmentType": "CHANGE_IN_CAPACITY",
"ScalingAdjustment": -1,
"CoolDown": 300
}
},
"Trigger": {
"CloudWatchAlarmDefinition": {
"ComparisonOperator": "GREATER_THAN_OR_EQUAL",
"EvaluationPeriods": 1,
"MetricName": "YARNMemoryAvailablePercentage",
"Namespace": "AWS/ElasticMapReduce",
"Period": 600,
"Statistic": "AVERAGE",
"Threshold": 60.0,
"Unit": "PERCENT"
}
}
}
]
}
This is a basic implementation of how to create an EMR cluster using Terraform. You can further execute this Terraform code on a schedule through a Concourse CI/CD pipeline.
Tags: #DevOps #CI/CD #Terraform #Infra-as-Code #IaC #AWSEMR #CloudAutomation #POC #InnovativeIdea #CloudInsight #AWSCloudArchitect #AWSSolutionArchitect #Concourse #GoCloudArchitect
Senior DevOps consultant| AWS SAA Certified ☁️ | CI/CD|GIT | 🐳| ☸️|Linux|Jenkins|EKS|Terraform
1yHi