2016-05-16
最近公司有个项目,要爬取7个视频网站、3个图片网站,关于5个关键词的视频、图片数据,并将爬取到的视频、图片做成展示墙,放到企业客户的iPad上展示用。以下是相关的etl过程,写了一整天,纪念一下。哈哈~
1、SQL语句:ddl_social_scan
-- CREATE TABLE social_scan AS -- 1_in_Maybelline SELECT * FROM ( SELECT source AS website, ( CASE brand WHEN 'Maybelline' THEN '美宝莲' WHEN 'Loreal' THEN '欧莱雅' WHEN 'Meiji' THEN '美即' WHEN 'NYX' THEN 'NYX' ELSE 'none' END ) AS keyword, tag_name AS title, ROUND( IFNULL( ( POWER(IFNULL(pic_like_num, 0), 2) + POWER( IFNULL(pic_comments_num, 0), 2 ) + POWER(IFNULL(browse_num, 0), 2) ) / ( IFNULL(pic_like_num, 0) + IFNULL(pic_comments_num, 0) + IFNULL(browse_num, 0) ), 0 ), 0 ) AS volume, IFNULL(pic_like_num, 0) AS dianzan_num, IFNULL(pic_comments_num, 0) AS comment_num, IFNULL(browse_num, 0) AS browse_num, 1 / 0 AS stay_time, REPLACE ( LEFT ( SUBSTRING_INDEX( picture.nice.pic_url, '.info/in/', - 1 ), 10 ), '/', '' ) AS upload_date, pic_url AS download_url, REPLACE (date_time, '-', '') AS sys_date, 'picture' AS file_type, 1 / 0 AS media_file_name, 1 / 0 AS save_dir, 0 AS video_not_found FROM picture.nice WHERE source = 'in' AND brand = 'Maybelline' ORDER BY volume DESC LIMIT 20 ) 1_in_Maybelline UNION ALL -- 2_in_Loreal SELECT * FROM ( SELECT source AS website, ( CASE brand WHEN 'Maybelline' THEN '美宝莲' WHEN 'Loreal' THEN '欧莱雅' WHEN 'Meiji' THEN '美即' WHEN 'NYX' THEN 'NYX' ELSE 'none' END ) AS keyword, tag_name AS title, ROUND( IFNULL( ( POWER(IFNULL(pic_like_num, 0), 2) + POWER( IFNULL(pic_comments_num, 0), 2 ) + POWER(IFNULL(browse_num, 0), 2) ) / ( IFNULL(pic_like_num, 0) + IFNULL(pic_comments_num, 0) + IFNULL(browse_num, 0) ), 0 ), 0 ) AS volume, IFNULL(pic_like_num, 0) AS dianzan_num, IFNULL(pic_comments_num, 0) AS comment_num, IFNULL(browse_num, 0) AS browse_num, 1 / 0 AS stay_time, REPLACE ( LEFT ( SUBSTRING_INDEX( picture.nice.pic_url, '.info/in/', - 1 ), 10 ), '/', '' ) AS upload_date, pic_url AS download_url, REPLACE (date_time, '-', '') AS sys_date, 'picture' AS file_type, 1 / 0 AS media_file_name, 1 / 0 AS save_dir, 0 AS video_not_found FROM picture.nice WHERE source = 'in' AND brand = 'Loreal' ORDER BY volume DESC LIMIT 20 ) 2_in_Loreal UNION ALL -- 3_in_Meiji SELECT * FROM ( SELECT source AS website, ( CASE brand WHEN 'Maybelline' THEN '美宝莲' WHEN 'Loreal' THEN '欧莱雅' WHEN 'Meiji' THEN '美即' WHEN 'NYX' THEN 'NYX' ELSE 'none' END ) AS keyword, tag_name AS title, ROUND( IFNULL( ( POWER(IFNULL(pic_like_num, 0), 2) + POWER( IFNULL(pic_comments_num, 0), 2 ) + POWER(IFNULL(browse_num, 0), 2) ) / ( IFNULL(pic_like_num, 0) + IFNULL(pic_comments_num, 0) + IFNULL(browse_num, 0) ), 0 ), 0 ) AS volume, IFNULL(pic_like_num, 0) AS dianzan_num, IFNULL(pic_comments_num, 0) AS comment_num, IFNULL(browse_num, 0) AS browse_num, 1 / 0 AS stay_time, REPLACE ( LEFT ( SUBSTRING_INDEX( picture.nice.pic_url, '.info/in/', - 1 ), 10 ), '/', '' ) AS upload_date, pic_url AS download_url, REPLACE (date_time, '-', '') AS sys_date, 'picture' AS file_type, 1 / 0 AS media_file_name, 1 / 0 AS save_dir, 0 AS video_not_found FROM picture.nice WHERE source = 'in' AND brand = 'Meiji' ORDER BY volume DESC LIMIT 20 ) 3_in_Meiji UNION ALL -- 4_in_NYX SELECT * FROM ( SELECT source AS website, ( CASE brand WHEN 'Maybelline' THEN '美宝莲' WHEN 'Loreal' THEN '欧莱雅' WHEN 'Meiji' THEN '美即' WHEN 'NYX' THEN 'NYX' ELSE 'none' END ) AS keyword, tag_name AS title, ROUND( IFNULL( ( POWER(IFNULL(pic_like_num, 0), 2) + POWER( IFNULL(pic_comments_num, 0), 2 ) + POWER(IFNULL(browse_num, 0), 2) ) / ( IFNULL(pic_like_num, 0) + IFNULL(pic_comments_num, 0) + IFNULL(browse_num, 0) ), 0 ), 0 ) AS volume, IFNULL(pic_like_num, 0) AS dianzan_num, IFNULL(pic_comments_num, 0) AS comment_num, IFNULL(browse_num, 0) AS browse_num, 1 / 0 AS stay_time, REPLACE ( LEFT ( SUBSTRING_INDEX( picture.nice.pic_url, '.info/in/', - 1 ), 10 ), '/', '' ) AS upload_date, pic_url AS download_url, REPLACE (date_time, '-', '') AS sys_date, 'picture' AS file_type, 1 / 0 AS media_file_name, 1 / 0 AS save_dir, 0 AS video_not_found FROM picture.nice WHERE source = 'in' AND brand = 'NYX' ORDER BY volume DESC LIMIT 20 ) 4_in_NYX UNION ALL -- 5_nice_Maybelline SELECT * FROM ( SELECT source AS website, ( CASE brand WHEN 'Maybelline' THEN '美宝莲' WHEN 'Loreal' THEN '欧莱雅' WHEN 'Meiji' THEN '美即' WHEN 'NYX' THEN 'NYX' ELSE 'none' END ) AS keyword, tag_name AS title, ROUND( IFNULL( ( POWER(IFNULL(pic_like_num, 0), 2) + POWER( IFNULL(pic_comments_num, 0), 2 ) + POWER(IFNULL(browse_num, 0), 2) ) / ( IFNULL(pic_like_num, 0) + IFNULL(pic_comments_num, 0) + IFNULL(browse_num, 0) ), 0 ), 0 ) AS volume, IFNULL(pic_like_num, 0) AS dianzan_num, IFNULL(pic_comments_num, 0) AS comment_num, IFNULL(browse_num, 0) AS browse_num, 1 / 0 AS stay_time, REPLACE ( LEFT ( SUBSTRING_INDEX( picture.nice.pic_url, 'upload/show/', - 1 ), 10 ), '/', '' ) AS upload_date, pic_url AS download_url, REPLACE (date_time, '-', '') AS sys_date, 'picture' AS file_type, 1 / 0 AS media_file_name, 1 / 0 AS save_dir, 0 AS video_not_found FROM picture.nice WHERE source = 'nice' AND brand = 'Maybelline' ORDER BY volume DESC LIMIT 20 ) 5_nice_Maybelline UNION ALL -- 6_nice_Loreal SELECT * FROM ( SELECT source AS website, ( CASE brand WHEN 'Maybelline' THEN '美宝莲' WHEN 'Loreal' THEN '欧莱雅' WHEN 'Meiji' THEN '美即' WHEN 'NYX' THEN 'NYX' ELSE 'none' END ) AS keyword, tag_name AS title, ROUND( IFNULL( ( POWER(IFNULL(pic_like_num, 0), 2) + POWER( IFNULL(pic_comments_num, 0), 2 ) + POWER(IFNULL(browse_num, 0), 2) ) / ( IFNULL(pic_like_num, 0) + IFNULL(pic_comments_num, 0) + IFNULL(browse_num, 0) ), 0 ), 0 ) AS volume, IFNULL(pic_like_num, 0) AS dianzan_num, IFNULL(pic_comments_num, 0) AS comment_num, IFNULL(browse_num, 0) AS browse_num, 1 / 0 AS stay_time, REPLACE ( LEFT ( SUBSTRING_INDEX( picture.nice.pic_url, 'upload/show/', - 1 ), 10 ), '/', '' ) AS upload_date, pic_url AS download_url, REPLACE (date_time, '-', '') AS sys_date, 'picture' AS file_type, 1 / 0 AS media_file_name, 1 / 0 AS save_dir, 0 AS video_not_found FROM picture.nice WHERE source = 'nice' AND brand = 'Loreal' ORDER BY volume DESC LIMIT 20 ) 6_nice_Loreal UNION ALL -- 7_nice_Meiji SELECT * FROM ( SELECT source AS website, ( CASE brand WHEN 'Maybelline' THEN '美宝莲' WHEN 'Loreal' THEN '欧莱雅' WHEN 'Meiji' THEN '美即' WHEN 'NYX' THEN 'NYX' ELSE 'none' END ) AS keyword, tag_name AS title, ROUND( IFNULL( ( POWER(IFNULL(pic_like_num, 0), 2) + POWER( IFNULL(pic_comments_num, 0), 2 ) + POWER(IFNULL(browse_num, 0), 2) ) / ( IFNULL(pic_like_num, 0) + IFNULL(pic_comments_num, 0) + IFNULL(browse_num, 0) ), 0 ), 0 ) AS volume, IFNULL(pic_like_num, 0) AS dianzan_num, IFNULL(pic_comments_num, 0) AS comment_num, IFNULL(browse_num, 0) AS browse_num, 1 / 0 AS stay_time, REPLACE ( LEFT ( SUBSTRING_INDEX( picture.nice.pic_url, 'upload/show/', - 1 ), 10 ), '/', '' ) AS upload_date, pic_url AS download_url, REPLACE (date_time, '-', '') AS sys_date, 'picture' AS file_type, 1 / 0 AS media_file_name, 1 / 0 AS save_dir, 0 AS video_not_found FROM picture.nice WHERE source = 'nice' AND brand = 'Meiji' ORDER BY volume DESC LIMIT 20 ) 7_nice_Meiji UNION ALL -- 8_nice_NYX SELECT * FROM ( SELECT source AS website, ( CASE brand WHEN 'Maybelline' THEN '美宝莲' WHEN 'Loreal' THEN '欧莱雅' WHEN 'Meiji' THEN '美即' WHEN 'NYX' THEN 'NYX' ELSE 'none' END ) AS keyword, tag_name AS title, ROUND( IFNULL( ( POWER(IFNULL(pic_like_num, 0), 2) + POWER( IFNULL(pic_comments_num, 0), 2 ) + POWER(IFNULL(browse_num, 0), 2) ) / ( IFNULL(pic_like_num, 0) + IFNULL(pic_comments_num, 0) + IFNULL(browse_num, 0) ), 0 ), 0 ) AS volume, IFNULL(pic_like_num, 0) AS dianzan_num, IFNULL(pic_comments_num, 0) AS comment_num, IFNULL(browse_num, 0) AS browse_num, 1 / 0 AS stay_time, REPLACE ( LEFT ( SUBSTRING_INDEX( picture.nice.pic_url, 'upload/show/', - 1 ), 10 ), '/', '' ) AS upload_date, pic_url AS download_url, REPLACE (date_time, '-', '') AS sys_date, 'picture' AS file_type, 1 / 0 AS media_file_name, 1 / 0 AS save_dir, 0 AS video_not_found FROM picture.nice WHERE source = 'nice' AND brand = 'Meiji' ORDER BY volume DESC LIMIT 20 ) 8_nice_NYX UNION ALL -- 9_美拍_美宝莲 SELECT * FROM ( SELECT DISTINCT source AS website, ( CASE keyword WHEN '美宝莲' THEN '美宝莲' WHEN 'Loreal' THEN '欧莱雅' WHEN 'Meiji' THEN '美即' WHEN 'NYX' THEN 'NYX' ELSE 'none' END ) AS keyword, title AS title, ROUND( IFNULL( ( POWER(IFNULL(zanCount, 0), 2) + POWER(IFNULL(commentCount, 0), 2) + POWER(IFNULL(watchCount, 0), 2) + POWER(IFNULL(netCount, 0), 2) + POWER(IFNULL(favCount, 0), 2) + POWER(IFNULL(tanmuCount, 0), 2) + POWER(IFNULL(shareCount, 0), 2) ) / ( IFNULL(zanCount, 0) + IFNULL(commentCount, 0) + IFNULL(watchCount, 0) + IFNULL(netCount, 0) + IFNULL(favCount, 0) + IFNULL(tanmuCount, 0) + IFNULL(shareCount, 0) ), 0 ), 0 ) AS volume, IFNULL(zanCount, 0) AS dianzan_num, IFNULL(commentCount, 0) AS comment_num, IFNULL(watchCount, 0) AS browse_num, length AS stay_time, REPLACE (uploadTime, '-', '') AS upload_date, downUrl AS download_url, REPLACE (LEFT(cralwTime, 10), '-', '') AS sys_date, 'media' AS file_type, 1 / 0 AS media_file_name, 1 / 0 AS save_dir, 0 AS video_not_found FROM media.7videos WHERE source = '美拍' AND keyword = '美宝莲' AND downUrl <> '' ORDER BY volume DESC LIMIT 20 ) 9_美拍_美宝莲