hive电商项目:1数仓构建过程

2019-04-13 15:18发布

1、获取数据源 后台的服务数据 前台的点击流日志数据 业务数据 2、确定主题 用户主题:用户、会员相关的信息 订单主题:订单相关的信息 浏览器主题:跟浏览器相关的信息 事件主题:跟事件相关 3、创建模型(创建表) 100张表以内不分层,以外将分层。 如果维度表较多则将维度表单独分层 1、创建维度层: dim_维度表 地域维度:拆分成 省 市 浏览器维度:浏览器名称、版本 时间维度:周、季度 事件维度:category(种类)、action 平台维度: kpi:(指标) 创建ods库:注:ods=operate database store 创建dw库:dw=data warehouse 数据仓库 创建dm库: dm=date manipulation数据操作 命名规则: 库名_主题_表名 ods_user_addr_ ODS_USER_ADDR_(大小写均可,但最好小写) ods.user.addr不行 事实表 create database if not exists dim;//纬度 create database if not exists ods;//操作数据库存储 create database if not exists dw;//数据仓库 create database if not exists dm;//数据操作 在维度库下面创建维度表:注:``字段带有特殊符号用这个解决,这里desc是关键字,所以用`` CREATE TABLE IF NOT EXISTS `dim_province` ( `id` int, `province` string, `country_id` int, `desc` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_city` ( `id` int, `city` string, `desc` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_province_city` ( `dim_region_id` bigint, `dim_region_city_name` string, `dim_region_province_name` string, `dim_region_country_name` string, `dim_region_city_id` string, `dim_region_province_id` string, `dim_region_country_id` string, `dim_region_date` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_platform` ( `id` int, `platform_name` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_kpi` ( `id` int, `kpi_name` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_event_name` ( `id` int, `name` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_event_category` ( `id` int, `category` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_event_action` ( `id` int, `action` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_browser_name` ( `id` int, `browser_name` string, `browser_version_id` int ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_browser_version` ( `id` int, `browser_version` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_order` ( `oid` bigint, `on` string, `cut_id` bigint, `cua_id` bigint, `browser_version` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_currency_type` ( `id` int, `currency_name` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_payment_type` ( `id` int, `payment_type` string ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_week`( STD_WEEK_CODE STRING, STD_WEEK_NAME STRING, BEGIN_DATE STRING, END_DATE STRING, NOTES STRING, IS_DISPLAY INT, DISPLAY_ORDER INT, IS_VALID INT, UPDATE_DATE STRING, LAST_STD_WEEK_CODE STRING ) row format delimited fields terminated by ' ' ; CREATE TABLE IF NOT EXISTS `dim_userinfo` ( `uid` String, `uname` string ) row format delimited fields terminated by ' ' ; load data local inpath '/root/dim/dim_province' into table dim_province; load data local inpath '/root/dim/dim_city' into table dim_city; load data local inpath '/root/dim/dim_province_city' into table dim_province_city; load data local inpath '/root/dim/dim_platform' into table dim_platform; load data local inpath '/root/dim/dim_kpi' into table dim_kpi; load data local inpath '/root/dim/dim_event_name' into table dim_event_name; load data local inpath '/root/dim/dim_browser_name' into table dim_browser_name; load data local inpath '/root/dim/dim_browser_version' into table dim_browser_version; load data local inpath '/root/dim/dim_userinfo' into table dim_userinfo; --------------------------------------------------- 在ods层创建数据表:这里用到对元数据进行处理(UDF) create table if not exists ods_font_log( ip string, ts string, server_ip string, url string ) row format delimited fields terminated by 'u0001' stored as textfile ; create table if not exists ods_end_log( ip string, ts string, server_ip string, url string ) row format delimited fields terminated by 'u0001' stored as textfile ; create table if not exists ods_logs( ip string, ts string, server_ip string, url string ) row format delimited fields terminated by ' ' stored as orc ; insert into ods_logs select from ods_font_log union all select from ods_end_log ; udf: ip url解析 或者用mr清洗数据: -------------------------------------------------- create table if not exists ods_logs( ver string, s_time string, en string, u_ud string, u_mid string, u_sd string, c_time string, l string, b_iev string, b_rst string, p_url string, p_ref string, tt string, pl string, ip string, oid string, `on` string, cua string, cut string, pt string, ca string, ac string, kv_ string, du string, browser_name string, browser_version string, os_name string, os_version string, country string, province string, city string ) row format delimited fields terminated by 'u0001' stored as textfile ; 注: 删除内部表会直接删除元数据(metadata)及存储数据; 删除外部表仅仅会删除元数据,HDFS上的文件并不会被删除; create external table if not exists ods_logs_orc( ver string, s_time string, en string, u_ud string, u_mid string, u_sd string, c_time string, l string, b_iev string, b_rst string, p_url string, p_ref string, tt string, pl string, ip string, oid string, `on` string, cua string, cut string, pt string, ca string, ac string, kv_ string, du string, browser_name string, browser_version string, os_name string, os_version string, country string, province string, city string ) partitioned by (month string,day string) row format delimited fields terminated by 'u0001' stored as orc ; 导入原始数据: #load data inpath '/ods/11/09/' into table ods_logs; 我因为没跑mr程序所以没有这个数据,让同学给了一份:/root/dim/part-m-00000 load data local inpath '/root/dim/part-m-00000' into table ods_logs; 将数据导入到orc表中 ----------------------- from ods_logs insert into ods_logs_orc( ver , s_time , en , u_ud , u_mid , u_sd , c_time , l , b_iev , b_rst , p_url , p_ref , tt , pl , ip , oid , `on` , cua , cut , pt , ca , ac , kv_ , du , browser_name , browser_version , os_name , os_version , country , province , city ) partition (month=1,day='01') select ver , s_time , en , u_ud , u_mid , u_sd , c_time , l , b_iev , b_rst , p_url , p_ref , tt , pl , ip , oid , `on` , cua , cut , pt , ca , ac , kv_ , du , browser_name , browser_version , os_name , os_version , country , province , city ; ------------------ 上面的报错,可以直接简写为: from ods_logs insert into ods_logs_orc partition(month=1,day=1) select * ; 创建dw层的数据 ##为新增用户、总用户、活跃用户做计算: create table if not exists dwd_user( pl string, pl_id string, en string, en_id string, browser_name string, browser_id string, browser_version string, browser_version_id string, province_name string, province_id string, city_name string, city_id string, uid string ) partitioned by (month string,day string) row format delimited fields terminated by 'u0001' stored as orc ; 导入数据: from ( select ol.pl, dp.id as pl_id, ol.en, de.id as en_id, ol.browser_name, db.id browser_id, ol.browser_version , dv.id browser_version_id, ol.province , pv.id province_id, ol.city , dc.id city_id, ol.u_ud from ods.ods_logs_orc ol left join dim.dim_platform dp on dp.platform_name = ol.pl left join dim.dim_event_name de on de.name = ol.en left join dim.dim_browser_name db on db.browser_name = ol.browser_name left join dim.dim_browser_version dv on dv.browser_version = ol.browser_version left join dim.dim_province pv on pv.province = ol.province left join dim.dim_city dc on dc.city = ol.city ) tmp insert into dwd_user partition (month='1',day = '01') select * ; ----------------------- 另一种写法: with tmp as ( select ol.pl, dp.id as pl_id, ol.en, de.id as en_id, ol.browser_name, db.id browser_id, ol.browser_version , dv.id browser_version_id, ol.province_name , pv.id province_id, ol.city_name , dc.id city_id, ol.uid from ods_logs_orc ol left join dim_platform dp on dp.platform_name = ol.pl left join dim_event_name de on de.name = ol.en left join dim_browser_name db on db.browser_name = ol.browser_Name left join dim_browser_version dv on dv.browser_version = ol.browser_version left join dim_province pv on pv.province = ol.province_name left join dim_city dc on dc.city = ol.city_name ) insert into dwd_user partition (month='1',day = '01') select * from tmp ; -------------------------------