CloudFormation EMR 템플릿 예시
2020-05-30
.
Data_Engineering_TIL(20200530)
[개요]
클라우드 포메이션을 이용한 EMR 템플릿 예시
[템플릿 예시]
{
"AWSTemplateFormatVersion": "2010-09-09",
"Resources": {
"PMSEMRInstance": {
"Type": "AWS::EMR::Cluster",
"Properties": {
"Applications": [
{
"Name": "Hadoop"
},
{
"Name": "Hive"
},
{
"Name": "Hue"
},
{
"Name": "Spark"
},
{
"Name": "Zeppelin"
},
{
"Name": "Ganglia"
},
{
"Name": "Tez"
},
{
"Name": "Oozie"
},
{
"Name": "Presto"
}
],
"BootstrapActions": [{
"Name": "PMSBootstrap",
"ScriptBootstrapAction": {
"Path": "s3://pms-example-bucket/bootstrap-example.sh"
}
}],
"Configurations": [{
"Classification": "yarn-site",
"ConfigurationProperties": {
"yarn.node-labels.enabled": "true",
"yarn.node-labels.am.default-node-label-expression": "CORE",
"yarn.log-aggregation.retain-seconds": "432000",
"yarn.scheduler.minimum-allocation-mb": "5856",
"yarn.resourcemanager.scheduler.class": "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler",
"yarn.scheduler.fair.user-as-default-queue": "false",
"yarn.scheduler.fair.preemption": "true"
}
},
{
"Classification": "mapred-site",
"ConfigurationProperties": {
"mapreduce.tasktracker.map.tasks.maximum": "4",
"mapreduce.tasktracker.reduce.tasks.maximum": "4",
"mapreduce.tasktracker.http.threads": "400"
}
},
{
"Classification": "hive-site",
"ConfigurationProperties": {
"hive.exec.dynamic.partition.mode": "nonstrict",
"hive.exec.max.dynamic.partitions": "21000",
"hive.exec.max.dynamic.partitions.pernode": "70000",
"hive.scratch.dir.permission": "777",
"hive.server2.in.place.progress": "false",
"hive.vectorized.execution.enabled": "true",
"hive.vectorized.execution.reduce.enabled": "true",
"hive.load.dynamic.partitions.thread": "25",
"hive.metastore.warehouse.dir": "s3://pms-example-bucket",
"hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
"hive.metastore.schema.verification": "false",
"javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver",
"javax.jdo.option.ConnectionURL": "jdbc:mysql://[RDS_Endpoint]:3306/hive?createDatabaseIfNotExist=true&autoReconnect=true",
"javax.jdo.option.ConnectionUserName": "user",
"javax.jdo.option.ConnectionPassword": "xxxxxxxxxxxxx",
"hive.merge.sparkfiles": "true",
"hive.merge.mapfiles": "true",
"hive.merge.mapredfiles": "true",
"hive.merge.tezfiles": "true",
"hive.merge.smallfiles.avgsize": "134217728"
}
},
{
"Classification": "presto-connector-hive",
"ConfigurationProperties": {
"hive.metastore": "glue"
}
},
{
"Classification": "oozie-site",
"ConfigurationProperties": {
"oozie.service.JPAService.jdbc.password": "xxxxxxxxxxxxx",
"oozie.service.JPAService.jdbc.url": "jdbc:mysql://[RDS_Endpoint]:3306/${oozie.db.schema.name}",
"oozie.service.JPAService.validate.db.connection": "true",
"oozie.service.JPAService.create.db.schema": "true",
"oozie.service.JPAService.jdbc.driver": "org.mariadb.jdbc.Driver",
"oozie.service.JPAService.jdbc.username": "oozie",
"oozie.db.schema.name": "oozie",
"oozie.service.JPAService.pool.max.active.conn": "100",
"oozie.processing.timezone": "GMT+0900",
"oozie.action.max.output.data": "102400"
}
},
{
"Classification": "hue-ini",
"Configurations": [{
"Classification": "desktop",
"ConfigurationProperties": {
"time_zone": "Asia/Seoul"
},
"Configurations": [{
"Classification": "database",
"ConfigurationProperties": {
"name": "hue",
"user": "user",
"password": "xxxxxxxxxx",
"host": "[RDS_Endpoint]",
"port": "3306",
"engine": "mysql"
},
"Configurations": []
}]
}]
},
{
"Classification": "spark-defaults",
"ConfigurationProperties": {
"hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
"spark.eventLog.enabled": "false",
"spark.history.fs.cleaner.enabled": "true",
"spark.history.fs.cleaner.interval": "3h",
"spark.history.fs.cleaner.maxAge": "1h"
}
}
],
"Instances": {
"MasterInstanceFleet": {
"Name": "pms-master-fleet",
"TargetOnDemandCapacity": 1,
"InstanceTypeConfigs": [{
"EbsConfiguration": {
"EbsBlockDeviceConfigs": [{
"VolumeSpecification": {
"SizeInGB": 100,
"VolumeType": "gp2"
},
"VolumesPerInstance": 1
}],
"EbsOptimized": true
},
"InstanceType": "r5.xlarge"
}]
},
"CoreInstanceFleet": {
"Name": "pms-core-fleet",
"TargetSpotCapacity": 1,
"InstanceTypeConfigs": [{
"EbsConfiguration": {
"EbsBlockDeviceConfigs": [{
"VolumeSpecification": {
"SizeInGB": 100,
"VolumeType": "gp2"
},
"VolumesPerInstance": 1
}],
"EbsOptimized": true
},
"InstanceType": "r5.xlarge"
}]
},
"Ec2KeyName": "pms-ec2-keypair",
"Ec2SubnetId": "subnet-xxxxxxxxxxxxxxxxxx",
"EmrManagedMasterSecurityGroup": "sg-xxxxxxxxxxxxxxxxxx",
"EmrManagedSlaveSecurityGroup": "sg-xxxxxxxxxxxxxxxxxx",
"ServiceAccessSecurityGroup": "sg-xxxxxxxxxxxxxxxxxxxx",
"HadoopVersion": "2.8.5",
"TerminationProtected": false
},
"JobFlowRole": "pms-ec2-default-role",
"LogUri": "s3://pms-test-bucket/",
"Name": "pms-EMR",
"ReleaseLabel": "emr-5.28.1",
"ScaleDownBehavior": "TERMINATE_AT_TASK_COMPLETION",
"ServiceRole": "pms-default-role",
"Tags": [{
"Key": "Name",
"Value": "pms-EMR"
}],
"VisibleToAllUsers": true
}
},
"FleetTaskInstance": {
"Type": "AWS::EMR::InstanceFleetConfig",
"Properties": {
"ClusterId": {
"Ref": "PMSEMRInstance"
},
"Name": "pms-task-fleet",
"InstanceFleetType": "TASK",
"TargetSpotCapacity": 1,
"LaunchSpecifications": {
"SpotSpecification": {
"TimeoutDurationMinutes": 5,
"TimeoutAction": "SWITCH_TO_ON_DEMAND"
}
},
"InstanceTypeConfigs": [{
"InstanceType": "r5.xlarge",
"EbsConfiguration": {
"EbsBlockDeviceConfigs": [{
"VolumeSpecification": {
"SizeInGB": 100,
"VolumeType": "gp2"
},
"VolumesPerInstance": 1
}],
"EbsOptimized": true
}
}]
}
},
"STEPSTATEPUSHER": {
"Type": "AWS::EMR::Step",
"Properties": {
"ActionOnFailure": "CONTINUE",
"HadoopJarStep": {
"Args": [
"state-pusher-script"
],
"Jar": "command-runner.jar"
},
"JobFlowId": {
"Ref": "PMSEMRInstance"
},
"Name": "Setup hadoop debugging"
}
},
"STEPROOTVOLUME": {
"Type": "AWS::EMR::Step",
"DependsOn": [
"STEPSTATEPUSHER"
],
"Properties": {
"ActionOnFailure": "CONTINUE",
"HadoopJarStep": {
"Args": [
"s3://pms-bucket/step/script",
"example.sh",
"."
],
"Jar": "s3://pms-bucket/example.jar"
},
"JobFlowId": {
"Ref": "PMSEMRInstance"
},
"Name": "Root Volume Up"
}
},
"STEPCRON": {
"Type": "AWS::EMR::Step",
"DependsOn": [
"STEPROOTVOLUME"
],
"Properties": {
"ActionOnFailure": "CONTINUE",
"HadoopJarStep": {
"Args": [
"s3://pms-bucket/script",
"example2.sh",
"."
],
"Jar": "s3://pms-bucket/library/example2.jar"
},
"JobFlowId": {
"Ref": "PMSEMRInstance"
},
"Name": "Setting Crontab for hdfs tmp clean"
}
},
"STEPHUE": {
"Type": "AWS::EMR::Step",
"DependsOn": [
"STEPCRON"
],
"Properties": {
"ActionOnFailure": "CONTINUE",
"HadoopJarStep": {
"Args": [
"s3://pms-bucket/",
"example3.sh",
"."
],
"Jar": "s3://pms-bucket/library/example4.jar"
},
"JobFlowId": {
"Ref": "PMSEMRInstance"
},
"Name": "Setting Hue"
}
},
"STEPOOZIE": {
"Type": "AWS::EMR::Step",
"DependsOn": [
"STEPHUE"
],
"Properties": {
"ActionOnFailure": "CONTINUE",
"HadoopJarStep": {
"Args": [
"s3://pms-bucket/script",
"example5.sh",
"."
],
"Jar": "s3://pms-bucket/library/example6.jar"
},
"JobFlowId": {
"Ref": "PMSEMRInstance"
},
"Name": "Setup oozie library"
}
},
"STEPSCHEDULER": {
"Type": "AWS::EMR::Step",
"DependsOn": [
"STEPOOZIE"
],
"Properties": {
"ActionOnFailure": "CONTINUE",
"HadoopJarStep": {
"Args": [
"s3://pms-bucket/script",
"example7.sh",
"."
],
"Jar": "s3://pms-bucket-test/library/example8.jar"
},
"JobFlowId": {
"Ref": "PMSEMRInstance"
},
"Name": "Install fair scheduler"
}
},
"STEPSPARK": {
"Type": "AWS::EMR::Step",
"DependsOn": [
"STEPSCHEDULER"
],
"Properties": {
"ActionOnFailure": "CONTINUE",
"HadoopJarStep": {
"Args": [
"s3://pms-bucket-test/script",
"example9.sh",
"."
],
"Jar": "s3://pms-bucket-test/example10.jar"
},
"JobFlowId": {
"Ref": "PMSEMRInstance"
},
"Name": "Setting Spark"
}
},
"STEPZEPPELIN": {
"Type": "AWS::EMR::Step",
"DependsOn": [
"STEPSPARK"
],
"Properties": {
"ActionOnFailure": "CONTINUE",
"HadoopJarStep": {
"Args": [
"s3://pms-bucket-test/script",
"example11.sh",
"."
],
"Jar": "s3://pms-bucket-test/example12.jar"
},
"JobFlowId": {
"Ref": "PMSEMRInstance"
},
"Name": "Install zeppelin"
}
},
"STEPHADOOP": {
"Type": "AWS::EMR::Step",
"DependsOn": [
"STEPZEPPELIN"
],
"Properties": {
"ActionOnFailure": "CONTINUE",
"HadoopJarStep": {
"Args": [
"s3://pms-bucket-test/script",
"example14.sh",
"."
],
"Jar": "s3://pms-bucket-test/example13.jar"
},
"JobFlowId": {
"Ref": "PMSEMRInstance"
},
"Name": "Setting Hadoop"
}
}
}
}