使用TPC_DS产生压测数据
下载TPC-DS压测工具:(测试版本 2.11.0)
http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp
解压之后在tools目录下执行make进行编译
编译之后使用dsdgen产生数据,其中 -scale 指定产生多大的数据,比如下面产生 1000GB,-parallel指定将数据切成4个片段,-child指定当前产生第几个片段
./dsdgen -dir /home/ec2-user/data/ -scale 1000 -parallel 4 -child 1
./dsdgen -dir /home/ec2-user/data/ -scale 1000 -parallel 4 -child 2
./dsdgen -dir /home/ec2-user/data/ -scale 1000 -parallel 4 -child 3
./dsdgen -dir /home/ec2-user/data/ -scale 1000 -parallel 4 -child 4
产生查询脚本:
./dsqgen -output_dir /home/ec2-user/query/ -input ../query_templates/templates.lst -scale 1 -DIALECT netezza -DIRECTORY ../query_templates -QUALIFY y -VERBOSE Y
或者用这个方法产生
for id in `seq 1 99`; do ./dsqgen -DIRECTORY ../query_templates -TEMPLATE "query$id.tpl" -DIALECT netezza -FILTER Y > ~/query/"query$id.sql"; done
如果遇到_END的报错,那么在../query_templates路径下,对所有的tpl文件后面添加一行: define _END = "";,使用脚本做如下:
ls *.tpl | while read line; do echo "define _END = \"\";" >> $line; done   # 只处理 .tpl 文件,避免把 templates.lst 等其他文件也改坏
建表语句
创建表的语句在 tools文件夹下的 tpcds.sql 文件中,以及tpcds_source.sql 文件中
TPC-DS 工具说明
有关工具的使用以及目录结构说明,参考specification文件夹下的 specification.pdf 文件中的表格:Table 0-1 Electronically Available Specification Material
有关产生1G,1T,3T,10T,30T,100T的文件,每个表数据量大小,可以参考 specification文件夹下的 specification.pdf 文件中的 Table 3-2 Database Row Counts 这个表格来对比。
附录:在AWS EMR创建Hive表:
在AWS Global账号下启动过EMR进行测试的命令
aws emr create-cluster --auto-scaling-role EMR_AutoScaling_DefaultRole \
--applications Name=Hadoop Name=Hive Name=Hue Name=Ganglia Name=Spark Name=Presto --ebs-root-volume-size 10 --ec2-attributes '{"KeyName":"Virginia","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-fc22e0a4","EmrManagedSlaveSecurityGroup":"sg-68c8f923","EmrManagedMasterSecurityGroup":"sg-08c2f343"}' \
--service-role EMR_DefaultRole \
--enable-debugging --log-uri 's3://aws-logs-372809795158-us-east-1/elasticmapreduce/' \
--name 'test-hive-spark' \
--release-label emr-5.25.0 \
--instance-groups '[{"InstanceCount":5,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":1000,"VolumeType":"gp2"},"VolumesPerInstance":1}],"EbsOptimized":true},"InstanceGroupType":"CORE","InstanceType":"c5.2xlarge","Name":"核心实例组 - 2"},{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":4}]},"InstanceGroupType":"MASTER","InstanceType":"c5.2xlarge","Name":"主实例组 - 1"}]' \
--configurations '[{"Classification":"hive-site","Properties":{"hive.metastore.client.factory.class":"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"}},{"Classification":"spark-hive-site","Properties":{"hive.metastore.client.factory.class":"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"}}]' \
--steps '[{"Args":["s3-dist-cp","--src","s3://xlaws/data/catalog_sales","--dest","hdfs:///catalog_sales"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"command-runner.jar","Properties":"","Name":"S3DistCpSales"},{"Args":["s3-dist-cp","--src","s3://xlaws/data/warehouse","--dest","hdfs:///warehouse"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"command-runner.jar","Properties":"","Name":"S3DistCpWarehouse"},{"Args":["s3-dist-cp","--src","s3://xlaws/data/date_dim","--dest","hdfs:///date_dim"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"command-runner.jar","Properties":"","Name":"S3DistCpDate"},{"Args":["s3-dist-cp","--src","s3://xlaws/data/ship_mode","--dest","hdfs:///ship_mode"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"command-runner.jar","Properties":"","Name":"S3DistCpShip"},{"Args":["s3-dist-cp","--src","s3://xlaws/data/call_center","--dest","hdfs:///call_center"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"command-runner.jar","Properties":"","Name":"S3DistCpCallCenter"}]' \
--scale-down-behavior TERMINATE_AT_TASK_COMPLETION --region us-east-1 --profile global
创建S3上的表
创建catalog_sales表
-- External table over the '|'-delimited text files produced by dsdgen for
-- the TPC-DS catalog_sales fact table. All *_sk columns are surrogate keys.
-- NOTE(review): TPC-DS defines the monetary columns as decimal(7,2); double
-- is used here and may introduce floating-point rounding — confirm acceptable.
CREATE EXTERNAL TABLE `catalog_sales`(
`cs_sold_date_sk` bigint,
`cs_sold_time_sk` bigint,
`cs_ship_date_sk` bigint,
`cs_bill_customer_sk` bigint,
`cs_bill_cdemo_sk` bigint,
`cs_bill_hdemo_sk` bigint,
`cs_bill_addr_sk` bigint,
`cs_ship_customer_sk` bigint,
`cs_ship_cdemo_sk` bigint,
`cs_ship_hdemo_sk` bigint,
`cs_ship_addr_sk` bigint,
`cs_call_center_sk` bigint,
`cs_catalog_page_sk` bigint,
`cs_ship_mode_sk` bigint,
`cs_warehouse_sk` bigint,
`cs_item_sk` bigint,
`cs_promo_sk` bigint,
`cs_order_number` bigint,
`cs_quantity` bigint,
`cs_wholesale_cost` double,
`cs_list_price` double,
`cs_sales_price` double,
`cs_ext_discount_amt` double,
`cs_ext_sales_price` double,
`cs_ext_wholesale_cost` double,
`cs_ext_list_price` double,
-- Fixed: the type keyword had leaked inside the backticks, creating a column
-- literally named "cs_ext_tax double" that no TPC-DS query would ever match.
`cs_ext_tax` double,
`cs_coupon_amt` double,
`cs_ext_ship_cost` double,
`cs_net_paid` double,
`cs_net_paid_inc_tax` double,
`cs_net_paid_inc_ship` double,
`cs_net_paid_inc_ship_tax` double,
-- Fixed: same backtick-quoting defect as cs_ext_tax above.
`cs_net_profit` double)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://xlaws/data/catalog_sales';
创建call_center 表
-- External table over the '|'-delimited text files produced by dsdgen for
-- the TPC-DS call_center dimension table.
CREATE EXTERNAL TABLE `call_center`(
`cc_call_center_sk` bigint,
`cc_call_center_id` string,
`cc_rec_start_date` string,
`cc_rec_end_date` string,
-- Fixed: was declared string; every other *_sk surrogate key in this file
-- (including cc_open_date_sk below) is bigint, and a string type would force
-- implicit casts when joining against date_dim.d_date_sk.
`cc_closed_date_sk` bigint,
`cc_open_date_sk` bigint,
`cc_name` string,
`cc_class` string,
`cc_employees` bigint,
`cc_sq_ft` bigint,
`cc_hours` string,
`cc_manager` string,
`cc_mkt_id` bigint,
`cc_mkt_class` string,
`cc_mkt_desc` string,
`cc_market_manager` string,
`cc_division` bigint,
`cc_division_name` string,
`cc_company` bigint,
`cc_company_name` string,
`cc_street_number` bigint,
`cc_street_name` string,
`cc_street_type` string,
`cc_suite_number` string,
`cc_city` string,
`cc_county` string,
`cc_state` string,
-- NOTE(review): bigint drops leading zeros in ZIP codes; TPC-DS models zip
-- as a fixed-width char — confirm downstream queries compare numerically.
`cc_zip` bigint,
`cc_country` string,
`cc_gmt_offset` bigint,
`cc_tax_percentage` double)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://xlaws/data/call_center/';
创建 date_dim 表
-- External table over the '|'-delimited text files produced by dsdgen for
-- the TPC-DS date_dim dimension table.
-- STORED AS TEXTFILE is Hive's shorthand for the TextInputFormat /
-- HiveIgnoreKeyTextOutputFormat pair, so the storage format is unchanged.
CREATE EXTERNAL TABLE `date_dim` (
  `d_date_sk`           BIGINT,
  `d_date_id`           STRING,
  `d_date`              STRING,
  `d_month_seq`         BIGINT,
  `d_week_seq`          BIGINT,
  `d_quarter_seq`       BIGINT,
  `d_year`              BIGINT,
  `d_dow`               BIGINT,
  `d_moy`               BIGINT,
  `d_dom`               BIGINT,
  `d_qoy`               BIGINT,
  `d_fy_year`           BIGINT,
  `d_fy_quarter_seq`    BIGINT,
  `d_fy_week_seq`       BIGINT,
  `d_day_name`          STRING,
  `d_quarter_name`      STRING,
  `d_holiday`           STRING,
  `d_weekend`           STRING,
  `d_following_holiday` STRING,
  `d_first_dom`         BIGINT,
  `d_last_dom`          BIGINT,
  `d_same_day_ly`       BIGINT,
  `d_same_day_lq`       BIGINT,
  `d_current_day`       STRING,
  `d_current_week`      STRING,
  `d_current_month`     STRING,
  `d_current_quarter`   STRING,
  `d_current_year`      STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION 's3://xlaws/data/date_dim/';
创建ship_mode表
-- External table over the '|'-delimited text files produced by dsdgen for
-- the TPC-DS ship_mode dimension table.
-- STORED AS TEXTFILE is Hive's shorthand for the TextInputFormat /
-- HiveIgnoreKeyTextOutputFormat pair, so the storage format is unchanged.
CREATE EXTERNAL TABLE `ship_mode` (
  `sm_ship_mode_sk` BIGINT,
  `sm_ship_mode_id` STRING,
  `sm_type`         STRING,
  `sm_code`         STRING,
  `sm_carrier`      STRING,
  `sm_contract`     STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION 's3://xlaws/data/ship_mode/';
创建warehouse表
-- External table over the '|'-delimited text files produced by dsdgen for
-- the TPC-DS warehouse dimension table.
-- STORED AS TEXTFILE is Hive's shorthand for the TextInputFormat /
-- HiveIgnoreKeyTextOutputFormat pair, so the storage format is unchanged.
CREATE EXTERNAL TABLE `warehouse` (
  `w_warehouse_sk`    BIGINT,
  `w_warehouse_id`    STRING,
  `w_warehouse_name`  STRING,
  `w_warehouse_sq_ft` BIGINT,
  `w_street_number`   BIGINT,
  `w_street_name`     STRING,
  `w_street_type`     STRING,
  `w_suite_number`    STRING,
  `w_city`            STRING,
  `w_county`          STRING,
  `w_state`           STRING,
  `w_zip`             BIGINT,
  `w_country`         STRING,
  `w_gmt_offset`      BIGINT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION 's3://xlaws/data/warehouse/';
最后更新于