• 基于Hadoop生态圈的数据仓库实践 —— 环境搭建(三)笔记


    -- 建立源库表
    USE source;

    -- 建立客户表
    CREATE TABLE customer (
    customer_number INT NOT NULL AUTO_INCREMENT PRIMARY KEY comment '客户编号,主键',
    customer_name VARCHAR(50) comment '客户名称',
    customer_street_address VARCHAR(50) comment '客户住址',
    customer_zip_code INT comment '邮编',
    customer_city VARCHAR(30) comment '所在城市',
    customer_state VARCHAR(2) comment '所在省份' );

    -- 建立产品表
    CREATE TABLE product (
    product_code INT NOT NULL AUTO_INCREMENT PRIMARY KEY comment '产品编码,主键',
    product_name VARCHAR(30) comment '产品名称',
    product_category VARCHAR(30) comment '产品类型' );

    -- 建立销售订单表
    CREATE TABLE sales_order (
    order_number INT NOT NULL AUTO_INCREMENT PRIMARY KEY comment '订单号,主键',
    customer_number INT comment '客户编号',
    product_code INT comment '产品编码',
    order_date DATETIME comment '订单日期',
    entry_date DATETIME comment '登记日期',
    order_amount DECIMAL(10 , 2 ) comment '销售金额',
    foreign key (customer_number)
    references customer (customer_number)
    on delete cascade on update cascade,
    foreign key (product_code)
    references product (product_code)
    on delete cascade on update cascade );

    -- 新增客户数据 
    INSERT INTO customer (customer_name, customer_street_address, customer_zip_code, customer_city, customer_state) VALUES ('Really Large Customers', '7500 Louise Dr.',17050, 'Mechanicsburg','PA'), ('Small Stores', '2500 Woodland St.',17055, 'Pittsburgh','PA'), ('Medium Retailers','1111 Ritter Rd.',17055,'Pittsburgh','PA'), ('Good Companies','9500 Scott St.',17050,'Mechanicsburg','PA'), ('Wonderful Shops','3333 Rossmoyne Rd.',17050,'Mechanicsburg','PA'), ('Loyal Clients','7070 Ritter Rd.',17055,'Pittsburgh','PA'), ('Distinguished Partners','9999 Scott St.',17050,'Mechanicsburg','PA');

    -- 生成产品表测试数据
    INSERT INTO product (product_name, product_category ) VALUES ('Hard Disk Drive', 'Storage'), ('Floppy Drive', 'Storage'), ('LCD Panel', 'Monitor');


    -- 生成100条销售订单表测试数据
    DROP PROCEDURE IF EXISTS generate_sales_order_data;

    delimiter //
    DROP PROCEDURE IF EXISTS generate_sales_order_data;
    create procedure generate_sales_order_data()
    begin
    DROP TABLE IF EXISTS temp_sales_order_data;
    CREATE TABLE temp_sales_order_data AS SELECT * FROM sales_order WHERE 1=0;
    SET @start_date := unix_timestamp('2016-03-01');
    SET @end_date := unix_timestamp('2016-07-01');
    SET @i := 1;
    while @i<=100 do
    SET @customer_number := floor(1 + rand() * 6);
    SET @product_code := floor(1 + rand() * 2);
    SET @order_date := from_unixtime(@start_date + rand() * (@end_date - @start_date));
    SET @amount := floor(1000 + rand() * 9000);
    INSERT INTO temp_sales_order_data VALUES (@i,@customer_number,@product_code,@order_date,@order_date,@amount);
    SET @i:=@i+1;
    END while;
    TRUNCATE TABLE sales_order;
    INSERT INTO sales_order SELECT NULL,customer_number,product_code,order_date,entry_date,order_amount FROM temp_sales_order_data ORDER BY order_date;
    select * from sales_order;
    COMMIT;
    end//


    delimiter ;
    call generate_sales_order_data();


    -- 建立RDS库表
    USE rds;
    -- 建立客户过渡表
    CREATE TABLE customer ( customer_number INT comment 'number', customer_name VARCHAR(30) comment 'name', customer_street_address VARCHAR(30) comment 'address', customer_zip_code INT comment 'zipcode', customer_city VARCHAR(30) comment 'city', customer_state VARCHAR(2) comment 'state');
    -- 建立产品过渡表
    CREATE TABLE product ( product_code INT comment 'code', product_name VARCHAR(30) comment 'name', product_category VARCHAR(30) comment 'category' );
    -- 建立销售订单过渡表
    CREATE TABLE sales_order ( order_number INT comment 'order number', customer_number INT comment 'customer number', product_code INT comment 'product code', order_date TIMESTAMP comment 'order date', entry_date TIMESTAMP comment 'entry date', order_amount DECIMAL(10 , 2 ) comment 'order amount');


    -- 建立数据仓库表
    USE dw;
    -- 建立客户维度表
    CREATE TABLE customer_dim ( customer_sk INT comment 'surrogate key', customer_number INT comment 'number', customer_name VARCHAR(50) comment 'name', customer_street_address VARCHAR(50) comment 'address', customer_zip_code INT comment 'zipcode', customer_city VARCHAR(30) comment 'city', customer_state VARCHAR(2) comment 'state', version INT comment 'version', effective_date DATE comment 'effective date', expiry_date DATE comment 'expiry date' )CLUSTERED BY (customer_sk) INTO 8 BUCKETS STORED AS ORC ;

    -- 建立产品维度表
    CREATE TABLE product_dim ( product_sk INT comment 'surrogate key', product_code INT comment 'code', product_name VARCHAR(30) comment 'name', product_category VARCHAR(30) comment 'category', version INT comment 'version', effective_date DATE comment 'effective date', expiry_date DATE comment 'expiry date' )CLUSTERED BY (product_sk) INTO 8 BUCKETS STORED AS ORC;

    -- 建立订单维度表
    CREATE TABLE order_dim ( order_sk INT comment 'surrogate key', order_number INT comment 'number', version INT comment 'version', effective_date DATE comment 'effective date', expiry_date DATE comment 'expiry date' )CLUSTERED BY (order_sk) INTO 8 BUCKETS STORED AS ORC ;

    -- 建立销售订单事实表
    CREATE TABLE sales_order_fact ( order_sk INT comment 'order surrogate key', customer_sk INT comment 'customer surrogate key', product_sk INT comment 'product surrogate key', order_date_sk INT comment 'date surrogate key', order_amount DECIMAL(10 , 2 ) comment 'order amount' )CLUSTERED BY (order_sk) INTO 8 BUCKETS STORED AS ORC;

    #########date_dim_generate.sh########
    date1="$1"
    date2="$2"
    tempdate=`date -d "$date1" +%F`
    tempdateSec=`date -d "$date1" +%s`
    enddateSec=`date -d "$date2" +%s`
    min=1
    max=`expr ( $enddateSec - $tempdateSec ) / ( 24 * 60 * 60 ) + 1`
    cat /dev/null > ./date_dim.csv

    #echo max
    while [ $min -le $max ]
    do
    month=`date -d "$tempdate" +%m`
    month_name=`date -d "$tempdate" +%B`
    quarter=`echo $month | awk '{print int (($0-1)/3)+1}'`
    year=`date -d "$tempdate" +%Y`
    echo ${min}","${tempdate}","${month}","${month_name}","${quarter}","${year} >> ./date_dim.csv
    tempdate=`date -d "+$min day $date1" +%F`
    tempdateSec=`date -d "+$min day $date1" +%s`
    min=`expr $min + 1`
    done


    ##########create_table_date_dim.sql#############
    drop table if exists date_dim;
    create table date_dim (
    date_sk int comment 'surrogate key',
    date date comment 'date,yyyy-mm-dd',
    month tinyint comment 'month',
    month_name varchar(9) comment 'month name',
    quarter tinyint comment 'quarter',
    year smallint comment 'year')comment 'date dimension table'row format delimited fields terminated by ',' stored as textfile;

    ./date_dim_generate.sh 2000-01-01 2020-12-31

    beeline -u jdbc:hive2://localhost:10000/dw -f create_table_date_dim.sql --silent

    hdfs dfs -put -f date_dim.csv /user/hive/warehouse/dw.db/date_dim/

    转载:https://blog.csdn.net/wzy0623/article/details/51783410

  • 相关阅读:
    一个不错的谈架构的链接
    监控报警平台设计思路
    从数据库到NoSQL思路整理
    大数据流式计算:关键技术及系统实例
    接口和类
    学习的逻辑
    [kuangbin带你飞]专题五 并查集 J
    [kuangbin带你飞]专题五 并查集 E
    [kuangbin带你飞]专题五 并查集 D
    [kuangbin带你飞]专题五 并查集 C
  • 原文地址:https://www.cnblogs.com/chengjunhao/p/9734967.html
Copyright © 2020-2023  润新知