CloudFormation EMR 템플릿 예시

2020-05-30

.

Data_Engineering_TIL(20200530)

[개요]

AWS CloudFormation을 이용해 EMR 클러스터(인스턴스 플릿, 애플리케이션 설정, 스텝 포함)를 생성하는 템플릿 예시

[템플릿 예시]

{
    "AWSTemplateFormatVersion": "2010-09-09",
    "Resources": {
        "PMSEMRInstance": {
            "Type": "AWS::EMR::Cluster",
            "Properties": {
                "Applications": [
                    {
                        "Name": "Hadoop"
                    },
                    {
                        "Name": "Hive"
                    },
                    {
                        "Name": "Hue"
                    },
                    {
                        "Name": "Spark"
                    },
                    {
                        "Name": "Zeppelin"
                    },
                    {
                        "Name": "Ganglia"
                    },
                    {
                        "Name": "Tez"
                    },
                    {
                        "Name": "Oozie"
                    },
                    {
                        "Name": "Presto"
                    }
                ],
                "BootstrapActions": [{
                    "Name": "PMSBootstrap",
                    "ScriptBootstrapAction": {
                        "Path": "s3://pms-example-bucket/bootstrap-example.sh"
                    }
                }],
                "Configurations": [{
                        "Classification": "yarn-site",
                        "ConfigurationProperties": {
                            "yarn.node-labels.enabled": "true",
                            "yarn.node-labels.am.default-node-label-expression": "CORE",
                            "yarn.log-aggregation.retain-seconds": "432000",
                            "yarn.scheduler.minimum-allocation-mb": "5856",
                            "yarn.resourcemanager.scheduler.class": "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler",
                            "yarn.scheduler.fair.user-as-default-queue": "false",
                            "yarn.scheduler.fair.preemption": "true"
                        }
                    },
                    {
                        "Classification": "mapred-site",
                        "ConfigurationProperties": {
                            "mapreduce.tasktracker.map.tasks.maximum": "4",
                            "mapreduce.tasktracker.reduce.tasks.maximum": "4",
                            "mapreduce.tasktracker.http.threads": "400"
                        }
                    },
                    {
                        "Classification": "hive-site",
                        "ConfigurationProperties": {
                            "hive.exec.dynamic.partition.mode": "nonstrict",
                            "hive.exec.max.dynamic.partitions": "21000",
                            "hive.exec.max.dynamic.partitions.pernode": "70000",
                            "hive.scratch.dir.permission": "777",
                            "hive.server2.in.place.progress": "false",
                            "hive.vectorized.execution.enabled": "true",
                            "hive.vectorized.execution.reduce.enabled": "true",
                            "hive.load.dynamic.partitions.thread": "25",
                            "hive.metastore.warehouse.dir": "s3://pms-example-bucket",
                            "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
                            "hive.metastore.schema.verification": "false",
                            "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver",
                            "javax.jdo.option.ConnectionURL": "jdbc:mysql://[RDS_Endpoint]:3306/hive?createDatabaseIfNotExist=true&autoReconnect=true",
                            "javax.jdo.option.ConnectionUserName": "user",
                            "javax.jdo.option.ConnectionPassword": "xxxxxxxxxxxxx",
                            "hive.merge.sparkfiles": "true",
                            "hive.merge.mapfiles": "true",
                            "hive.merge.mapredfiles": "true",
                            "hive.merge.tezfiles": "true",
                            "hive.merge.smallfiles.avgsize": "134217728"
                        }
                    },
                    {
                        "Classification": "presto-connector-hive",
                        "ConfigurationProperties": {
                            "hive.metastore": "glue"
                        }
                    },
                    {
                        "Classification": "oozie-site",
                        "ConfigurationProperties": {
                            "oozie.service.JPAService.jdbc.password": "xxxxxxxxxx",
                            "oozie.service.JPAService.jdbc.url": "jdbc:mysql://[RDS_Endpoint]:3306/${oozie.db.schema.name}",
                            "oozie.service.JPAService.validate.db.connection": "true",
                            "oozie.service.JPAService.create.db.schema": "true",
                            "oozie.service.JPAService.jdbc.driver": "org.mariadb.jdbc.Driver",
                            "oozie.service.JPAService.jdbc.username": "oozie",
                            "oozie.db.schema.name": "oozie",
                            "oozie.service.JPAService.pool.max.active.conn": "100",
                            "oozie.processing.timezone": "GMT+0900",
                            "oozie.action.max.output.data": "102400"
                        }
                    },
                    {
                        "Classification": "hue-ini",
                        "Configurations": [{
                            "Classification": "desktop",
                            "ConfigurationProperties": {
                                "time_zone": "Asia/Seoul"
                            },
                            "Configurations": [{
                                "Classification": "database",
                                "ConfigurationProperties": {
                                    "name": "hue",
                                    "user": "user",
                                    "password": "xxxxxxxxxx",
                                    "host": "[RDS_Endpoint]",
                                    "port": "3306",
                                    "engine": "mysql"
                                },
                                "Configurations": []
                            }]
                        }]
                    },
                    {
                        "Classification": "spark-defaults",
                        "ConfigurationProperties": {
                            "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
                            "spark.eventLog.enabled": "false",
                            "spark.history.fs.cleaner.enabled": "true",
                            "spark.history.fs.cleaner.interval": "3h",
                            "spark.history.fs.cleaner.maxAge": "1h"
                        }
                    }
                ],
                "Instances": {
                    "MasterInstanceFleet": {
                        "Name": "pms-master-fleet",
                        "TargetOnDemandCapacity": 1,
                        "InstanceTypeConfigs": [{
                            "EbsConfiguration": {
                                "EbsBlockDeviceConfigs": [{
                                    "VolumeSpecification": {
                                        "SizeInGB": 100,
                                        "VolumeType": "gp2"
                                    },
                                    "VolumesPerInstance": 1
                                }],
                                "EbsOptimized": true
                            },
                            "InstanceType": "r5.xlarge"
                        }]
                    },
                    "CoreInstanceFleet": {
                        "Name": "pms-core-fleet",
                        "TargetSpotCapacity": 1,
                        "InstanceTypeConfigs": [{
                            "EbsConfiguration": {
                                "EbsBlockDeviceConfigs": [{
                                    "VolumeSpecification": {
                                        "SizeInGB": 100,
                                        "VolumeType": "gp2"
                                    },
                                    "VolumesPerInstance": 1
                                }],
                                "EbsOptimized": true
                            },
                            "InstanceType": "r5.xlarge"
                        }]
                    },
                    "Ec2KeyName": "pms-ec2-keypair",
                    "Ec2SubnetId": "subnet-xxxxxxxxxxxxxxxxxx",
                    "EmrManagedMasterSecurityGroup": "sg-xxxxxxxxxxxxxxxxxx",
                    "EmrManagedSlaveSecurityGroup": "sg-xxxxxxxxxxxxxxxxxx",
                    "ServiceAccessSecurityGroup": "sg-xxxxxxxxxxxxxxxxxxxx",
                    "HadoopVersion": "2.8.5",
                    "TerminationProtected": false
                },
                "JobFlowRole": "pms-ec2-default-role",
                "LogUri": "s3://pms-test-bucket/",
                "Name": "pms-EMR",
                "ReleaseLabel": "emr-5.28.1",
                "ScaleDownBehavior": "TERMINATE_AT_TASK_COMPLETION",
                "ServiceRole": "pms-default-role",
                "Tags": [{
                    "Key": "Name",
                    "Value": "pms-EMR"
                }],
                "VisibleToAllUsers": true
            }
        },
        "FleetTaskInstance": {
            "Type": "AWS::EMR::InstanceFleetConfig",
            "Properties": {
                "ClusterId": {
                    "Ref": "PMSEMRInstance"
                },
                "Name": "pms-task-fleet",
                "InstanceFleetType": "TASK",
                "TargetSpotCapacity" : 1,
                "LaunchSpecifications": {
                    "SpotSpecification": {
                        "TimeoutDurationMinutes": 5,
                        "TimeoutAction": "SWITCH_TO_ON_DEMAND"
                    }
                },
                "InstanceTypeConfigs": [{
                    "InstanceType": "r5.xlarge",
                    "EbsConfiguration": {
                        "EbsBlockDeviceConfigs": [{
                            "VolumeSpecification": {
                                "SizeInGB": 100,
                                "VolumeType": "gp2"
                            },
                            "VolumesPerInstance": 1
                        }],
                        "EbsOptimized": true
                    }
                }]
            }
        },
        "STEPSTATEPUSHER": {
            "Type": "AWS::EMR::Step",
            "Properties": {
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {
                    "Args": [
                        "state-pusher-script"
                    ],
                    "Jar": "command-runner.jar"
                },
                "JobFlowId": {
                    "Ref": "PMSEMRInstance"
                },
                "Name": "Setup hadoop debugging"
            }
        },
        "STEPROOTVOLUME": {
            "Type": "AWS::EMR::Step",
            "DependsOn": [
                "STEPSTATEPUSHER"
            ],
            "Properties": {
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {
                    "Args": [
                        "s3://pms-bucket/step/script",
                        "example.sh",
                        "."
                    ],
                    "Jar": "s3://pms-bucket/example.jar"
                },
                "JobFlowId": {
                    "Ref": "PMSEMRInstance"
                },
                "Name": "Root Volume Up"
            }
        },
        "STEPCRON": {
            "Type": "AWS::EMR::Step",
            "DependsOn": [
                "STEPROOTVOLUME"
            ],
            "Properties": {
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {
                    "Args": [
                        "s3://pms-bucket/script",
                        "example2.sh",
                        "."
                    ],
                    "Jar": "s3://pms-bucket/library/example2.jar"
                },
                "JobFlowId": {
                    "Ref": "PMSEMRInstance"
                },
                "Name": "Setting Crontab for hdfs tmp clean"
            }
        },
        "STEPHUE": {
            "Type": "AWS::EMR::Step",
            "DependsOn": [
                "STEPCRON"
            ],
            "Properties": {
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {
                    "Args": [
                        "s3://pms-bucket/",
                        "example3.sh",
                        "."
                    ],
                    "Jar": "s3://pms-bucket/library/example4.jar"
                },
                "JobFlowId": {
                    "Ref": "PMSEMRInstance"
                },
                "Name": "Setting Hue"
            }
        },
        "STEPOOZIE": {
            "Type": "AWS::EMR::Step",
            "DependsOn": [
                "STEPHUE"
            ],
            "Properties": {
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {
                    "Args": [
                        "s3://pms-bucket/script",
                        "example5.sh",
                        "."
                    ],
                    "Jar": "s3://pms-bucket/library/example6.jar"
                },
                "JobFlowId": {
                    "Ref": "PMSEMRInstance"
                },
                "Name": "Setup oozie library"
            }
        },
        "STEPSCHEDULER": {
            "Type": "AWS::EMR::Step",
            "DependsOn": [
                "STEPOOZIE"
            ],
            "Properties": {
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {
                    "Args": [
                        "s3://pms-bucket/script",
                        "example7.sh",
                        "."
                    ],
                    "Jar": "s3://pms-bucket-test/library/example8.jar"
                },
                "JobFlowId": {
                    "Ref": "PMSEMRInstance"
                },
                "Name": "Install fair scheduler"
            }
        },
        "STEPSPARK": {
            "Type": "AWS::EMR::Step",
            "DependsOn": [
                "STEPSCHEDULER"
            ],
            "Properties": {
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {
                    "Args": [
                        "s3://pms-bucket-test/script",
                        "example9.sh",
                        "."
                    ],
                    "Jar": "s3://pms-bucket-test/example10.jar"
                },
                "JobFlowId": {
                    "Ref": "PMSEMRInstance"
                },
                "Name": "Setting Spark"
            }
        },
        "STEPZEPPELIN": {
            "Type": "AWS::EMR::Step",
            "DependsOn": [
                "STEPSPARK"
            ],
            "Properties": {
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {
                    "Args": [
                        "s3://pms-bucket-test/script",
                        "example11.sh",
                        "."
                    ],
                    "Jar": "s3://pms-bucket-test/example12.jar"
                },
                "JobFlowId": {
                    "Ref": "PMSEMRInstance"
                },
                "Name": "Install zeppelin"
            }
        },
        "STEPHADOOP": {
            "Type": "AWS::EMR::Step",
            "DependsOn": [
                "STEPZEPPELIN"
            ],
            "Properties": {
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {
                    "Args": [
                        "s3://pms-bucket-test/script",
                        "example14.sh",
                        "."
                    ],
                    "Jar": "s3://pms-bucket-test/example13.jar"
                },
                "JobFlowId": {
                    "Ref": "PMSEMRInstance"
                },
                "Name": "Setting Hadoop"
            }
        }
    }
}