127.0.0.1:9200
URL: http://127.0.0.1:9200/likecs_art_db/_search
REQUEST:
Array
(
    [query] => Array
        (
            [match] => Array
                (
                    [text] => Array
                        (
                            [query] => Python+Spark2.0+hadoop学习笔记——Spark ML Pipeline机器学习流程
                        )

                )

        )

    [highlight] => Array
        (
            [fields] => Array
                (
                    [text] => stdClass Object
                        (
                        )

                )

            [pre_tags] => #em#
            [post_tags] => #/em#
        )

    [size] => 8
    [from] => 0
)
RESPONSE:
string(8295) "{"took":64,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":10000,"relation":"gte"},"max_score":72.81947,"hits":[{"_index":"likecs_art_db","_type":"_doc","_id":"99154","_score":72.81947,"_source":{"id":"99154","text":"Python+Spark2.0+hadoop\u5b66\u4e60\u7b14\u8bb0\u2014\u2014Spark ML Pipeline\u673a\u5668\u5b66\u4e60\u6d41\u7a0b","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        ","username":"zhuozige","tagsname":"","tagsid":"[]","catesname":"","catesid":"[]","createtime":"1586423157"},"highlight":{"text":["#em#Python#/em#+#em#Spark2.0#/em#+#em#hadoop#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#——#em#Spark#/em# #em#ML#/em# #em#Pipeline#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#流#/em##em#程#/em#"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"526169","_score":40.620884,"_source":{"id":"526169","text":"Spark ML\u673a\u5668\u5b66\u4e60","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        
","username":"Finley","tagsname":"","tagsid":"","catesname":"","catesid":"","createtime":"1634603622"},"highlight":{"text":["#em#Spark#/em# #em#ML#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em#"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"846660","_score":36.96408,"_source":{"id":"846660","text":"Spark2.0\u5b66\u4e60\uff08\u4e00\uff09--------Spark\u7b80\u4ecb","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        ","username":"tree1123","tagsname":null,"tagsid":"","catesname":null,"catesid":"","createtime":"1637564915"},"highlight":{"text":["#em#Spark2.0#/em##em#学#/em##em#习#/em#(一)--------#em#Spark#/em#简介"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"99472","_score":33.987568,"_source":{"id":"99472","text":"Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\uff1a\u96c6\u6210\u5b66\u4e60\u603b\u7ed3\n    \n\n\n\n\n\n            
Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\u2014\u2014\u968f\u673a\u68ee\u6797\u7b97\u6cd5Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\uff1aXgBoost\u7b97\u6cd5Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\uff1aAdaboost\u7b97\u6cd5","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        ","username":"wj-1314","tagsname":null,"tagsid":"","catesname":"","catesid":"","createtime":"1641130124"},"highlight":{"text":["#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#:集成#em#学#/em##em#习#/em#总结\n    \n\n\n\n\n\n            #em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#——随#em#机#/em#森林算法#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#:XgBoost算法#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#:Adaboost"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"867800","_score":33.837803,"_source":{"id":"867800","text":"Spark.ML\u4e4bPipeLine\u5b66\u4e60\u7b14\u8bb0","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        
","username":"kongchung","tagsname":"","tagsid":"","catesname":null,"catesid":"","createtime":"1638067467"},"highlight":{"text":["Spark.ML之#em#PipeLine#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"852936","_score":33.267914,"_source":{"id":"852936","text":"Hadoop\/Spark\u5165\u95e8\u5b66\u4e60\u7b14\u8bb0(\u5b8c\u7ed3)","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        ","username":"limitlessun","tagsname":"","tagsid":"","catesname":"","catesid":"","createtime":"1639334914"},"highlight":{"text":["#em#Hadoop#/em#/#em#Spark#/em#入门#em#学#/em##em#习#/em##em#笔#/em##em#记#/em#(完结)"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"60968","_score":32.83477,"_source":{"id":"60968","text":"Spark\u5b66\u4e60\u7b14\u8bb0","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        
","username":"killianxu","tagsname":"","tagsid":"[]","catesname":"\u5927\u6570\u636e","catesid":"[\"224\"]","createtime":"1560562336"},"highlight":{"text":["#em#Spark#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"114463","_score":32.83477,"_source":{"id":"114463","text":"SPARK\u5b66\u4e60\u7b14\u8bb0","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        ","username":"Hailong-Said","tagsname":"","tagsid":"[]","catesname":"\u5927\u6570\u636e","catesid":"[\"224\"]","createtime":"1601092932"},"highlight":{"text":["#em#SPARK#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#"]}}]}}"
127.0.0.1:9200
URL: http://127.0.0.1:9200/likecs_art_db/_search
REQUEST:
Array
(
    [query] => Array
        (
            [match] => Array
                (
                    [text] => Array
                        (
                            [query] => Python+Spark2.0+hadoop学习笔记——Spark ML Pipeline机器学习流程
                        )

                )

        )

    [highlight] => Array
        (
            [fields] => Array
                (
                    [text] => stdClass Object
                        (
                        )

                )

            [pre_tags] => #em#
            [post_tags] => #/em#
        )

    [size] => 8
    [from] => 8
)
RESPONSE:
string(9131) "{"took":66,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":10000,"relation":"gte"},"max_score":72.81947,"hits":[{"_index":"likecs_art_db","_type":"_doc","_id":"203459472","_score":32.790146,"_source":{"id":"203459472","text":"ML\u7b14\u8bb0 00\uff1a\u673a\u5668\u5b66\u4e60\u7c7b\u578b & \u6df1\u5ea6\u5b66\u4e60\u548c\u4f20\u7edf\u673a\u5668\u5b66\u4e60\u7684\u533a\u522b","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        ","username":"","tagsname":null,"tagsid":"","catesname":null,"catesid":"","createtime":"1630722495"},"highlight":{"text":["#em#ML#/em##em#笔#/em##em#记#/em# 00:#em#机#/em##em#器#/em##em#学#/em##em#习#/em#类型 & 深度#em#学#/em##em#习#/em#和传统#em#机#/em##em#器#/em##em#学#/em##em#习#/em#的区别"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"46168","_score":32.71071,"_source":{"id":"46168","text":"Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\uff1asklearn\u5e93\u7684\u5b66\u4e60","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        
","username":"wj-1314","tagsname":null,"tagsid":"","catesname":"","catesid":"","createtime":"1638186624"},"highlight":{"text":["#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#:sklearn库的#em#学#/em##em#习#/em#"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"830364","_score":32.683475,"_source":{"id":"830364","text":"\u300aPython\u673a\u5668\u5b66\u4e60\u300b\u7b14\u8bb0\uff08\u4e94\uff09","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        ","username":"yifdu25","tagsname":null,"tagsid":"","catesname":null,"catesid":"","createtime":"1637384465"},"highlight":{"text":["《#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em#》#em#笔#/em##em#记#/em#(五)"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"203488362","_score":32.683475,"_source":{"id":"203488362","text":"Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\u4e8c","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        
","username":"","tagsname":null,"tagsid":"","catesname":null,"catesid":"","createtime":"1639306053"},"highlight":{"text":["#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#二"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"688238","_score":32.502426,"_source":{"id":"688238","text":"Hadoop\u5b66\u4e60\u7b14\u8bb0\u7cfb\u5217\n    \n\n\n\n\nHadoop\u5b66\u4e60\u7b14\u8bb0\u7cfb\u5217","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        ","username":"think90","tagsname":"","tagsid":"","catesname":null,"catesid":"","createtime":"1635923347"},"highlight":{"text":["#em#Hadoop#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#系列\n    \n\n\n\n\n#em#Hadoop#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#系列"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"107331","_score":32.502174,"_source":{"id":"107331","text":"Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\uff1aSVM\uff081\uff09\u2014\u2014SVM\u6982\u8ff0\n    \n\n\n\n\n\n            Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\uff1aSVM\uff081\uff09\u2014\u2014SVM\u6982\u8ff0Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\uff1aSVM\uff082\uff09\u2014\u2014SVM\u6838\u51fd\u6570Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\uff1aSVM\uff083\uff09\u2014\u2014\u8bc1\u660eSVMPython\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\uff1aSVM\uff084\uff09\u2014\u2014sklearn\u5b9e\u73b0Python\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\uff1aLogistic 
Regression","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        ","username":"wj-1314","tagsname":null,"tagsid":"","catesname":"","catesid":"","createtime":"1637937618"},"highlight":{"text":["#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#:SVM(1)——SVM概述\n    \n\n\n\n\n\n            #em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#:SVM(1)——SVM概述#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#:SVM(2)——SVM核函数","#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#:SVM(3)——证明SVMPython#em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#:SVM(4)——sklearn实现#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#:Logistic Regression"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"688245","_score":32.389423,"_source":{"id":"688245","text":"Hadoop\u5b66\u4e60\u7b14\u8bb0","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        
","username":"Java-Starter","tagsname":"","tagsid":"","catesname":null,"catesid":"","createtime":"1635923385"},"highlight":{"text":["#em#Hadoop#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#"]}},{"_index":"likecs_art_db","_type":"_doc","_id":"755957","_score":32.336716,"_source":{"id":"755957","text":"\u3010\u673a\u5668\u5b66\u4e60\u7b14\u8bb0\u3011Python\u673a\u5668\u5b66\u4e60\u57fa\u672c\u8bed\u6cd5","intro":"\u76ee\u5f55\n\nECharts\n\u5f02\u6b65\u52a0\u8f7d\n\n\n\nECharts\r\n\u6570\u636e\u53ef\u89c6\u5316\u5728\u8fc7\u53bb\u51e0\u5e74\u4e2d\u53d6\u5f97\u4e86\u5de8\u5927\u8fdb\u5c55\u3002\u5f00\u53d1\u4eba\u5458\u5bf9\u53ef\u89c6\u5316\u4ea7\u54c1\u7684\u671f\u671b\u4e0d\u518d\u662f\u7b80\u5355\u7684\u56fe\u8868\u521b\u5efa\u5de5\u5177\uff0c\u800c\u662f\u5728\u4ea4\u4e92\u3001\u6027\u80fd\u3001\u6570\u636e\u5904\u7406\u7b49\u65b9\u9762\u6709\u66f4\u9ad8\u7684\u8981\u6c42\u3002\r\nchart.setOption({\r\n    color: [\r\n        ","username":"carmen-019","tagsname":"","tagsid":"","catesname":null,"catesid":"","createtime":"1636577151"},"highlight":{"text":["【#em#机#/em##em#器#/em##em#学#/em##em#习#/em##em#笔#/em##em#记#/em#】#em#Python#/em##em#机#/em##em#器#/em##em#学#/em##em#习#/em#基本语法"]}}]}}"
127.0.0.1:9200
URL: http://192.168.101.128/searchcore/index.php/cihere_cn_db/_search
REQUEST:
Array
(
    [query] => Array
        (
            [match] => Array
                (
                    [title] => Array
                        (
                            [query] => Python+Spark2.0+hadoop学习笔记——Spark ML Pipeline机器学习流程
                        )

                )

        )

    [highlight] => Array
        (
            [fields] => Array
                (
                    [title] => stdClass Object
                        (
                        )

                )

            [pre_tags] => #em#
            [post_tags] => #/em#
        )

    [from] => 0
)
RESPONSE:
bool(false)
127.0.0.1:9200
URL: http://127.0.0.1:9200/likecs_down_db/_search
REQUEST:
Array
(
    [query] => Array
        (
            [bool] => Array
                (
                    [must] => Array
                        (
                            [0] => Array
                                (
                                    [match] => Array
                                        (
                                            [title] => Array
                                                (
                                                    [query] => Python+Spark2.0+hadoop学习笔记——Spark ML Pipeline机器学习流程
                                                )

                                        )

                                )

                        )

                    [must_not] => Array
                        (
                            [0] => Array
                                (
                                    [term] => Array
                                        (
                                            [cate1] => 电子书籍
                                        )

                                )

                        )

                )

        )

    [highlight] => Array
        (
            [fields] => Array
                (
                    [title] => stdClass Object
                        (
                        )

                )

            [pre_tags] => #em#
            [post_tags] => #/em#
        )

    [size] => 5
    [from] => 0
)
RESPONSE:
string(3039) "{"took":11,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":7960,"relation":"eq"},"max_score":39.1793,"hits":[{"_index":"likecs_down_db","_type":"_doc","_id":"68980","_score":39.1793,"_source":{"id":"68980","title":"LearningNotes\u5b66\u4e60\u7b14\u8bb0 v1.0","spidertime":"1623080606","contenttime":"1672766683","tag":"Learning|Notes|\u5b66\u4e60\u7b14\u8bb0","cate1":"\u6e90\u7801\u4e0b\u8f7d","cate2":"\u8f6f\u4ef6\u5f00\u53d1","cate3":"java\u6e90\u7801","attr1":"372MB"},"highlight":{"title":["LearningNotes#em#学#/em##em#习#/em##em#笔#/em##em#记#/em# v1.0"]}},{"_index":"likecs_down_db","_type":"_doc","_id":"10511","_score":39.1793,"_source":{"id":"10511","title":"LearningNotes\u5b66\u4e60\u7b14\u8bb0 v1.0","spidertime":"1622877629","contenttime":"1675618049","tag":"Learning|Notes|\u5b66\u4e60\u7b14\u8bb0","cate1":"\u6e90\u7801\u4e0b\u8f7d","cate2":"\u8f6f\u4ef6\u5f00\u53d1","cate3":"java\u6e90\u7801","attr1":"372MB"},"highlight":{"title":["LearningNotes#em#学#/em##em#习#/em##em#笔#/em##em#记#/em# v1.0"]}},{"_index":"likecs_down_db","_type":"_doc","_id":"69961","_score":37.583344,"_source":{"id":"69961","title":"\u81ea\u5b66Python \u7f16\u7a0b\u57fa\u7840\u5b66\u4e60\u7b14\u8bb0 \u9ad8\u6e05pdf\u7248","spidertime":"1623125064","contenttime":"1670900605","pageimage":"https:\/\/img.jbzj.com\/do\/uploads\/litimg\/190403\/1J6102OC9.jpg","tag":"\u81ea\u5b66Python|\u7f16\u7a0b\u57fa\u7840","cate1":"\u7535\u5b50\u4e66\u7c4d","cate2":"\u7f16\u7a0b\u5f00\u53d1","cate3":"Python\u7535\u5b50\u4e66","attr1":"16.1MB"},"highlight":{"title":["自#em#学#/em##em#Python#/em# 编#em#程#/em#基础#em#学#/em##em#习#/em##em#笔#/em##em#记#/em# 高清pdf版"]}},{"_index":"likecs_down_db","_type":"_doc","_id":"34848","_score":37.583344,"_source":{"id":"34848","title":"\u81ea\u5b66Python \u7f16\u7a0b\u57fa\u7840\u5b66\u4e60\u7b14\u8bb0 
\u9ad8\u6e05pdf\u7248","spidertime":"1622959697","contenttime":"1673303418","pageimage":"https:\/\/img.jbzj.com\/do\/uploads\/litimg\/190403\/1J6102OC9.jpg","tag":"\u81ea\u5b66Python|\u7f16\u7a0b\u57fa\u7840","cate1":"\u7535\u5b50\u4e66\u7c4d","cate2":"\u7f16\u7a0b\u5f00\u53d1","cate3":"Python\u7535\u5b50\u4e66","attr1":"16.1MB"},"highlight":{"title":["自#em#学#/em##em#Python#/em# 编#em#程#/em#基础#em#学#/em##em#习#/em##em#笔#/em##em#记#/em# 高清pdf版"]}},{"_index":"likecs_down_db","_type":"_doc","_id":"69671","_score":36.63583,"_source":{"id":"69671","title":"\u5927\u6570\u636e\u5b66\u4e60\u7b14\u8bb0(Hadoop\u5b66\u4e60\u6587\u6863) \u4e2d\u6587pdf\u9ad8\u6e05\u7248","spidertime":"1623081540","contenttime":"1625046203","pageimage":"https:\/\/img.jbzj.com\/do\/uploads\/litimg\/200512\/1I0462K048.jpg","tag":"\u5927\u6570\u636e|Hadoop|\u624b\u518c|Hadoop\u5927\u6570\u636e","cate1":"\u7535\u5b50\u4e66\u7c4d","cate2":"\u670d\u52a1\u5668","attr1":"20.5MB"},"highlight":{"title":["大数据#em#学#/em##em#习#/em##em#笔#/em##em#记#/em#(#em#Hadoop#/em##em#学#/em##em#习#/em#文档) 中文pdf高清版"]}}]}}"
Python+Spark2.0+hadoop学习笔记——Spark ML Pipeline机器学习流程 - 爱码网
zhuozige

情况一:二元分类

这部分使用的数据集是判断网页是暂时的还是长青的。因为涉及到了文本的信息,所以需要进行文本的数字化和向量化。

在这部分中,机器学习分为三个部分,第一部分是建立机器学习流程pipeline,第二部分是训练,第三部分是预测。

在建立机器学习流程pipeline中包含4个阶段,如下所示:

StringIndexer:将文字的分类特征转换为数字。

OneHotEncoder:将一个数字的分类特征字段转为多个字段。

VectorAssembler:将所有的特征字段整合成一个Vector字段。

DecisionTreeClassifier:进行训练并且产生模型。

训练过程是指“训练数据DataFrame”使用pipeline.fit()进行训练,然后产生pipelineModel模型。

预测过程是指“新数据DataFrame”使用pipelineModel.transform()进行预测,预测完成后会产生“预测结果DataFrame”。

这部分先使用DecisionTree Classifier Model进行预测,代码如下:

global Path
if sc.master[0:5]=="local":
Path="file:/home/jorlinlee/pythonwork/PythonProject/"
else:
Path="hdfs://master:9000/user/jorlinlee"

创建row_dr DataFrame

row_df=sqlContext.read.format("csv").option("header","true").option("delimiter","\t").load(Path+"data/train.tsv")

编写DataFrames UDF 用户自定义函数(将"?"转换为"0")

from pyspark.sql.functions import udf

def replace_question(x):

return("0" if x=="?" else x)

replace_question=udf(replace_question)

将string数据类型转换为double数据类型

from pyspark.sql.functions import col

import pyspark.sql.types

df=row_df.select(

['url','alchemy_category'] +

[replace_question(col(column)).cast("double").alias(column)

for column in row_df.columns[4:]])

将数据分成train_df与test_df

train_df,test_df=df.randomSplit([0.7,0.3])
train_df.cache()
test_df.cache()

使用StringIndexer:将字符串分类特征字段转换为数值

from pyspark.ml.feature import StringIndexer

categoryIndexer=StringIndexer(inputCol='alchemy_category',outputCol="alchemy_category_Index")

categoryTransformer=categoryIndexer.fit(df)

df1=categoryTransformer.transform(train_df)

使用OneHotEncoder:将一个数值的分类特征字段转换为多个字段的Vector

from pyspark.ml.feature import OneHotEncoder

encoder=OneHotEncoder(dropLast=False,inputCol='alchemy_category_Index',outputCol="alchemy_category_IndexVec")

df2=encoder.transform(df1)

使用VectorAssembler:将多个特征字段整合成一个特征的Vector

from pyspark.ml.feature import VectorAssembler

assemblerInputs=['alchemy_category_IndexVec'] + row_df.columns[4:-1]

assembler=VectorAssembler(inputCols=assemblerInputs,outputCol="feature")

df3=assembler.transform(df2)

使用DecisionTreeClassifier二元分类

from pyspark.ml.classification import DecisionTreeClassifier

dt=DecisionTreeClassifier(labelCol="label",featuresCol="feature",impurity="gini",maxDepth=10,maxBins=14)

dt_model=dt.fit(df3)

进行预测

df4=dt_model.transform(df3)

以上是分解过程,下面使用pipeline流程组件

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

建立pipeline

stringIndexer=StringIndexer(inputCol='alchemy_category',
outputCol="alchemy_category_Index")
encoder=OneHotEncoder(dropLast=False,
inputCol='alchemy_category_Index',
outputCol="alchemy_category_IndexVec")
assemblerInputs=['alchemy_category_IndexVec']+row_df.columns[4:-1]
assembler=VectorAssembler(inputCols=assemblerInputs,outputCol="feature")
dt=DecisionTreeClassifier(labelCol="label",featuresCol="feature",impurity="gini",maxDepth=10,maxBins=14)
pipeline=Pipeline(stages=[stringIndexer,encoder,assembler,dt])

查看pipeline阶段

pipeline.getStages()

使用pipeline进行数据处理与训练

pipelineModel=pipeline.fit(train_df)

查看训练完成后的决策树模型(第3个阶段会产生决策树模型)

pipelineModel.stages[3]

使用pipelineModel.transform进行预测

predicted=pipelineModel.transform(test_df)

评估模型的准确性(使用AUC)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator=BinaryClassificationEvaluator(
rawPredictionCol="rawPrediction",
labelCol="label",
metricName="areaUnderROC")

auc=evaluator.evaluate(predicted)

auc

结果是:0.617

提出改进方案:

方案一:

使用TrainValidation进行训练验证找出最佳模型

from pyspark.ml.tuning import ParamGridBuilder,TrainValidationSplit

设置训练验证的参数

paramGrid=ParamGridBuilder().addGrid(dt.impurity,["gini","entropy"]).addGrid(dt.maxDepth,[5,10,15]).addGrid(dt.maxBins,[10,15,20]).build()

创建TrainValidationSplit

tvs=TrainValidationSplit(estimator=dt,evaluator=evaluator,estimatorParamMaps=paramGrid,trainRatio=0.8)

建立tvs_pipeline

tvs_pipeline=Pipeline(stages=[stringIndexer,encoder,assembler,tvs])

使用tvs_pipeline流程进行训练验证

tvs_pipelineModel=tvs_pipeline.fit(train_df)

评估最佳模型AUC

predictions=tvs_pipelineModel.transform(test_df)

auc=evaluator.evaluate(predictions)

auc

结果:0.656

方案二:使用crossValidation交叉验证找出最佳模型

from pyspark.ml.tuning import CrossValidator

建立交叉验证的CrossValidator(与之前的paramGrid有联系)

cv=CrossValidator(estimator=dt,evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=3)

建立交叉验证的cv_pipeline

cv_pipeline=Pipeline(stages=[stringIndexer,encoder,assembler,cv])

训练模型

cv_pipelineModel=cv_pipeline.fit(train_df)

评估最佳模型AUC

predictions=cv_pipelineModel.transform(test_df)

auc=evaluator.evaluate(predictions)

auc

结果:0.658

方案三:使用随机森林RandomForestClassifier分类器

from pyspark.ml.classification import RandomForestClassifier

建立随机森林分类模型

rf=RandomForestClassifier(labelCol="label",featuresCol="feature",numTrees=10)

建立随机森林分类pipeline

rfpipeline=Pipeline(stages=[stringIndexer,encoder,assembler,rf])

对随机森林模型进行训练

rfpipelineModel=rfpipeline.fit(train_df)

使用模型进行预测

rfpredicted=rfpipelineModel.transform(test_df)

auc=evaluator.evaluate(rfpredicted)

auc

结果:0.738

使用RandomForestClassifier TrainValidation找出最佳模型

from pyspark.ml.tuning import ParamGridBuilder,TrainValidationSplit

paramGrid=ParamGridBuilder().addGrid(rf.impurity,['gini','entropy']).addGrid(rf.maxDepth,[5,10,15]).addGrid(rf.maxBins,[10,15,20]).addGrid(rf.numTrees,[10,20,30]).build()

rftvs=TrainValidationSplit(estimator=rf,evaluator=evaluator,estimatorParamMaps=paramGrid,trainRatio=0.8)

rftvs_pipeline=Pipeline(stages=[stringIndexer,encoder,assembler,rftvs])

rftvs_pipelineModel=rftvs_pipeline.fit(train_df)

rftvspredictions=rftvs_pipelineModel.transform(test_df)

auc=evaluator.evaluate(rftvspredictions)

auc

结果是:0.760

使用crossValidation找出最佳模型

from pyspark.ml.tuning import CrossValidator,ParamGridBuilder

rfcv=CrossValidator(estimator=rf,evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=3)

rfcv_pipeline=Pipeline(stages=[stringIndexer,encoder,assembler,rfcv])

rfcv_pipelineModel=rfcv_pipeline.fit(train_df)

rfcvpredictions=rfcv_pipelineModel.transform(test_df)

auc=evaluator.evaluate(rfcvpredictions)

auc

结果:0.762

情况二:多元分类

这部分使用的数据是森林覆盖树种数据。数据中没有涉及到文本数据,因此在建立pipeline时使用VectorAssembler和相应的机器学习模型就好了。

读取数据

global Path
if sc.master[0:5]=="local":
Path="file:/home/jorlinlee/pythonwork/PythonProject/"
else:
Path="hdfs://master:9000/user/jorlinlee/"

rawData=sc.textFile(Path+"data/covtype.data")
lines=rawData.map(lambda x:x.split(","))

因为这份数据最后需要转换成DataFrame形式,但是这份数据没有字段名,因此需要人工增加

from pyspark.sql.types import StringType, StructField, StructType

fields=[StructField("f"+str(i), StringType(), True) for i in range(fieldnum)]

schema=StructType(fields)

构造DataFrame

covtype_df=spark.createDataFrame(lines,schema)

将string格式转换为double

from pyspark.sql.functions import col
covtype_df=covtype_df.select([col(column).cast("double").alias(column) for column in covtype_df.columns])

创建特征字段List

featuresCols=covtype_df.columns[:54]

创建label字段

covtype_df=covtype_df.withColumn("label",covtype_df["f54"] - 1).drop("f54")

将数据分成train_df与test_df

train_df,test_df=covtype_df.randomSplit([0.7,0.3])

train_df.cache()

test_df.cache()

导入模块

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

建立pipeline

vectorAssembler=VectorAssembler(inputCols=featuresCols,outputCol="features")
dt=DecisionTreeClassifier(labelCol="label",featuresCol="features",maxDepth=5,maxBins=20)
dt_pipeline=Pipeline(stages=[vectorAssembler,dt])

使用pipeline进行训练

pipelineModel=dt_pipeline.fit(train_df)

使用pipelinModel进行预测

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator=MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")

predicted=pipelineModel.transform(test_df)

accuracy=evaluator.evaluate(predicted)

accuracy

结果:0.703

使用TrainValidation进行训练验证找出最佳模型

from pyspark.ml.tuning import ParamGridBuilder,TrainValidationSplit

paramGrid=ParamGridBuilder().addGrid(dt.impurity,["gini","entropy"]).addGrid(dt.maxDepth,[10,15,25]).addGrid(dt.maxBins,[30,40,50]).build()

tvs=TrainValidationSplit(estimator=dt,evaluator=evaluator,estimatorParamMaps=paramGrid,trainRatio=0.8)

tvs_pipeline=Pipeline(stages=[vectorAssembler,tvs])

tvs_pipelineModel=tvs_pipeline.fit(train_df)

predictions=tvs_pipelineModel.transform(test_df)

accuracy=evaluator.evaluate(predictions)
accuracy

结果:0.930

情况三:回归分析

这部分使用的是共享单车预测的数据。这部分在建立pipeline时使用VectorAssembler(将所有的特征字段整合成vector)、VectorIndexer(将不重复数值的数量小于等于maxCategories参数值所对应的字段视为分类字段,否则视为数值字段)和机器学习模型。

代码如下:

导入数据

global Path
if sc.master[0:5]=="local":
Path="file:/home/jorlinlee/pythonwork/PythonProject/"
else:
Path="hdfs://master:9000/user/jorlinlee/"

hour_df=spark.read.format('csv').option("header",'true').load(Path+"data/hour.csv")

去掉不重要的字段

hour_df=hour_df.drop("instant").drop("dteday").drop("yr").drop("casual").drop("registered")

数据类型变换

from pyspark.sql.functions import col

hour_df=hour_df.select([col(column).cast("double").alias(column) for column in hour_df.columns])

建立pipeline流程

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer,VectorIndexer,VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

featuresCols=hour_df.columns[:-1]

vectorAssembler=VectorAssembler(inputCols=featuresCols,outputCol="aFeatures")

vectorIndexer=VectorIndexer(inputCol="aFeatures",outputCol="features",maxCategories=24)

dt=DecisionTreeRegressor(labelCol="cnt",featuresCol="features")

dt_pipeline=Pipeline(stages=[vectorAssembler,vectorIndexer,dt])

分割数据

train_df,test_df=hour_df.randomSplit([0.7,0.3])

train_df.cache()

test_df.cache()

使用pipeline进行数据处理与训练

dt_pipelineModel=dt_pipeline.fit(train_df)

predicted_df=dt_pipelineModel.transform(test_df)

评估模型的准确率

from pyspark.ml.evaluation import RegressionEvaluator

evaluator=RegressionEvaluator(labelCol='cnt',predictionCol='prediction',metricName="rmse")

predicted_df=dt_pipelineModel.transform(test_df)

rmse=evaluator.evaluate(predicted_df)

rmse

结果:95.617

使用TrainValidation进行训练验证找出最佳模型

from pyspark.ml.tuning import ParamGridBuilder,TrainValidationSplit

paramGrid=ParamGridBuilder().addGrid(dt.maxDepth,[5,10,15,25]).addGrid(dt.maxBins,[25,35,45,50]).build()

tvs=TrainValidationSplit(estimator=dt,evaluator=evaluator,estimatorParamMaps=paramGrid,trainRatio=0.8)

tvs_pipeline=Pipeline(stages=[vectorAssembler,vectorIndexer,tvs])

tvs_pipelineModel=tvs_pipeline.fit(train_df)

predictions=tvs_pipelineModel.transform(test_df)

rmse=evaluator.evaluate(predictions)

rmse

结果:78.285

使用crossValidation进行交叉验证找出最佳模型

from pyspark.ml.tuning import CrossValidator
from pyspark.ml import Pipeline

cv=CrossValidator(estimator=dt,evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=3)

cv_pipeline=Pipeline(stages=[vectorAssembler,vectorIndexer,cv])

cv_pipelineModel=cv_pipeline.fit(train_df)

predictions=cv_pipelineModel.transform(test_df)

rmse=evaluator.evaluate(predictions)

rmse

结果:78.457

使用GBT Regression(梯度提升树,一次只产生一棵决策树,再根据前一个决策树的结果决定如何产生下一个决策树)

from pyspark.ml.regression import GBTRegressor

gbt=GBTRegressor(labelCol='cnt',featuresCol='features')

gbt_pipeline=Pipeline(stages=[vectorAssembler,vectorIndexer,gbt])

gbt_pipelineModel=gbt_pipeline.fit(train_df)

predicted_df=gbt_pipelineModel.transform(test_df)

rmse=evaluator.evaluate(predicted_df)

rmse

结果:75.699

使用GBT Regression CrossValidation找出最佳模型

from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

paramGrid=ParamGridBuilder().addGrid(gbt.maxDepth,[5,10]).addGrid(gbt.maxBins,[25,40]).addGrid(gbt.maxIter,[10,50]).build()

cv=CrossValidator(estimator=gbt,evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=3)

cv_pipeline=Pipeline(stages=[vectorAssembler,vectorIndexer,cv])

cv_pipelineModel=cv_pipeline.fit(train_df)

predicted_df=cv_pipelineModel.transform(test_df)

evaluator=RegressionEvaluator(labelCol='cnt',predictionCol='prediction',metricName="rmse")

rmse=evaluator.evaluate(predicted_df)

rmse

结果:70.732

分类:

技术点:

相关文章:

  • 2021-09-04
  • 2021-11-29
  • 2021-11-20
  • 2021-12-12
  • 2021-11-03
  • 2021-11-26
  • 2021-11-03
  • 2021-11-11
猜你喜欢
  • 2021-10-19
  • 2021-11-22
  • 2022-01-02
  • 2021-11-28
  • 2021-12-13
  • 2019-06-15
  • 2020-09-26
相关资源
相似解决方案