1.
配置文件data-config.xml定义了数据库的基本配置,以及导出数据的映射规则,即导出数据库表中对应哪些字段的值,以及对特定字段的值做如何处理
<dataConfig>
  <!--
    DIH configuration that indexes the MySQL table marketing_data.

    Full import: reads a window of 1,000,000 ids starting at the "offset"
    request parameter (${dataimporter.request.offset}).
    Delta import: deltaQuery returns the primary keys of rows whose
    updated_at is newer than the last index time; deltaImportQuery then
    loads each changed row by primary key.
  -->
  <dataSource name="jdbc"
              driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://172.0.8.249:5606/marketing_db_saved?zeroDateTimeBehavior=convertToNull"
              user="developer"
              password="sedept@shiyanjun.cn" />
  <document name="mkt_data">
    <!--
      deltaImportQuery is declared explicitly. Without it DIH derives the
      delta import statement from "query", which depends on the "offset"
      request parameter and therefore cannot be reused for a delta-import
      request that does not pass it.
      deltaQuery only needs to return the primary key column.
    -->
    <entity name="marketing_data"
            pk="id"
            query="select * from marketing_data where id between ${dataimporter.request.offset} and ${dataimporter.request.offset}+1000000"
            deltaQuery="select id from marketing_data where updated_at > '${dih.last_index_time}'"
            deltaImportQuery="select * from marketing_data where id='${dih.delta.id}'"
            transformer="RegexTransformer">
      <field column="id" name="id" />
      <field column="domain" name="domain" />
      <field column="alex_rank" name="alex_rank" />
      <field column="server_port" name="server_port" />
      <field column="cert_validity_notBefore" name="cert_validity_notBefore" />
      <field column="cert_validity_notAfter" />
      <!--
        Derived field: RegexTransformer applies the regex to the value of
        cert_validity_notAfter and stores capture group 1, i.e. the text
        before the first whitespace.
        NOTE(review): presumably this is the date part of a date-time
        string; confirm against the actual column format.
      -->
      <field column="cert_validity_notAfter_yyyyMMdd"
             regex="(.*?)\s+.*"
             name="cert_validity_notAfter_yyyyMMdd"
             sourceColName="cert_validity_notAfter" />
      <field column="cert_issuer_brand" name="cert_issuer_brand" />
      <field column="cert_validation" name="cert_validation" />
      <field column="cert_isMultiDomain" name="cert_isMultiDomain" />
      <field column="cert_issuer_brand_isXRelated" name="cert_issuer_brand_isXRelated" />
      <field column="cert_isWildcard" name="cert_isWildcard" />
      <field column="cert_notAfter" name="cert_notAfter" />
      <field column="special_ssl" name="special_ssl" />
      <field column="competitor_logo" name="competitor_logo" />
      <field column="segment" name="segment" />
    </entity>
  </document>
</dataConfig>
Solr的DIH暴露了请求中传递的变量 ${dataimporter.request.offset},也就是在请求的requestHandler中可以附带附加属性条件,例如,下面请求URL中的offset=5000000参数:
http://172.0.8.212:8080/seaarch-server/core0/dataimport?command=full-import&offset=5000000
另外,还有一个参数是很重要的,它决定着是否清除已经存在的索引数据,默认为clean=true,如果不想删除以前的索引数据,一定要在请求的URL中指定该属性为false,请求URL如下:
http://172.0.8.212:8080/seaarch-server/core0/dataimport?command=full-import&offset=5000000&clean=false
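另外,索引完成后一般需要执行commit操作,将内存中的索引数据持久化到文件系统,防止变更丢失,所以需要在请求的URL中增加commit=true,例如:
http://172.0.8.212:8080/seaarch-server/core0/dataimport?command=full-import&offset=5000000&clean=false&commit=true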
配置:
<dataConfig>
  <!--
    DIH configuration that indexes a local XML feed file with
    XPathEntityProcessor, producing one document per /urlset/url element.
    Each row is first passed through the script function ReplaceLocAddId
    and then through DateFormatTransformer.
  -->
  <script><![CDATA[
    /*
     * Row transformer referenced by the entity as "script:ReplaceLocAddId".
     *
     * - Derives "id" from "loc": the text between "/deal/" and ".html".
     * - Replaces "startTime" and "endTime" with the result of
     *   com.sitech.util.DateUtils.parseMT(value, null).
     * - Removes the "折" character from "rebate".
     *
     * Returns the same row object after modification.
     *
     * Each "//" comment must stay on its own line: a line comment runs to
     * the end of the physical line, so collapsing this script onto a
     * single line would comment out every statement that follows it.
     */
    function ReplaceLocAddId(row) {
      var loc_1 = row.get('loc').split('/deal/');
      var loc_2 = loc_1[1].split('.html');
      var id = loc_2[0];
      row.put('id', id);

      // Format the timestamps.
      // var sdf = new java.text.SimpleDateFormat('yyyy-MM-dd HH:mm:ss');

      // Start time.
      row.put('startTime', com.sitech.util.DateUtils.parseMT(row.get('startTime'),null));

      // End time.
      row.put('endTime', com.sitech.util.DateUtils.parseMT(row.get('endTime'),null));

      // Strip the Chinese discount character from the rebate value.
      row.put('rebate', row.get('rebate').replace('折',''));

      return row;
    }
  ]]></script>

  <dataSource type="FileDataSource" encoding="utf-8" />

  <document>
    <!--
      NOTE(review): every field is flagged commonField="true" although all
      of them live under the forEach element; confirm this is intended.
    -->
    <entity name="tuan"
            pk="loc"
            url="D:/solr/source_data/meituan_hao123.xml"
            processor="XPathEntityProcessor"
            forEach="/urlset/url"
            transformer="script:ReplaceLocAddId,DateFormatTransformer">
      <field column="loc" xpath="/urlset/url/loc" commonField="true" />
      <field column="city" xpath="/urlset/url/data/display/city" commonField="true" />
      <field column="sort" xpath="/urlset/url/data/display/sort" commonField="true" />
      <field column="title" xpath="/urlset/url/data/display/title" commonField="true" />
      <field column="image" xpath="/urlset/url/data/display/image" commonField="true" />
      <field column="value" xpath="/urlset/url/data/display/value" commonField="true" />
      <field column="price" xpath="/urlset/url/data/display/price" commonField="true" />
      <field column="rebate" xpath="/urlset/url/data/display/rebate" commonField="true" />
      <field column="bought" xpath="/urlset/url/data/display/bought" commonField="true" />
      <field column="startTime"
             xpath="/urlset/url/data/display/startTime"
             dateTimeFormat="yyyy-MM-dd HH:mm:ss"
             commonField="true" />
      <field column="endTime"
             xpath="/urlset/url/data/display/endTime"
             dateTimeFormat="yyyy-MM-dd HH:mm:ss"
             commonField="true" />
    </entity>
  </document>
</dataConfig>
4、从oracle抽取数据建立索引
<dataConfig>
  <!--
    DIH configuration that indexes rows of the Oracle table TM_DETAILS
    (type = 2). For every parent row three sub-entities look up the
    template name, the creating user's name and the document title.
  -->
  <dataSource name="jdbc"
              driver="oracle.jdbc.driver.OracleDriver"
              url="jdbc:oracle:thin:@127.0.0.1:1522:ORCLLI"
              user="root"
              password="root"/>
  <document>
    <entity name="tm_details"
            query="select t.docid as id,t.tempid,t.cruser as userid,t.crtime,t.A_GZMC||t.A_XXMS||t.A_ZZPZZY as content from TM_DETAILS t where t.type=2 "
            transformer="ClobTransformer,HTMLStripTransformer,RegexTransformer,DateFormatTransformer">
      <field column="ID" name="id" />
      <field column="TEMPID" name="tempid" />

      <!-- Template name, looked up by the parent row's TEMPID. -->
      <entity name="template"
              query="select te.name from kmstemplate te where te.id=${tm_details.TEMPID}">
        <field column="NAME" name="template"/>
      </entity>

      <!-- Display name of the creating user, looked up by the parent row's USERID. -->
      <entity name="user"
              query="select msg.name from tb_sys_loginmsg msg where msg.login_id='${tm_details.USERID}'" >
        <field column="NAME" name="cruser"/>
      </entity>

      <field column="CRTIME" name="crtime" dateTimeFormat="yyyy-MM-dd HH:mm:ss"/>

      <!--
        Document title without a trailing ".htm" suffix.
        regexp_replace with an anchored pattern is used instead of
        rtrim(d.doctitle,'.htm'): RTRIM treats its second argument as a set
        of characters and would also strip trailing '.', 'h', 't' or 'm'
        characters that belong to the title itself
        (for example "math.htm" would become "ma").
      -->
      <entity name="doc"
              query="select regexp_replace(d.doctitle,'\.htm$','') as title from kmsdocument d where d.docid=${tm_details.ID}">
        <field column="TITLE" name="title" clob="true"/>
      </entity>

      <!--
        CONTENT is read from a CLOB, stripped of HTML and then cleaned with
        the regex below (matches replaced by the empty string).
        NOTE(review): "\\t" matches a literal backslash followed by "t",
        not a tab character; confirm whether "\t" was intended.
      -->
      <field column="CONTENT"
             name="content"
             clob="true"
             stripHTML="true"
             regex="\\t|\r|\n"
             replaceWith=""/>
    </entity>
  </document>
</dataConfig>
<dataConfig>
  <!--
    DIH configuration that indexes question/answer pairs from Oracle:
    TM_DETAILS_LIST_FAQ_WT joined with TM_DETAILS_LIST_FAQ_DA on
    docid, ordernum, numgroup and eid. The document id is built by
    concatenating docid, eid, ordernum and numgroup with the separators
    'e', 'o' and 'n'.
  -->
  <dataSource
      name="jdbc"
      driver="oracle.jdbc.driver.OracleDriver"
      url="jdbc:oracle:thin:@172.21.144.200:1522:ORCLLI"
      user="kms_user_js"
      password="kms_user_js"/>

  <document>
    <entity
        name="taocan"
        transformer="ClobTransformer,DateFormatTransformer"
        query="select t.docid||'e'||t.eid||'o'||t.ordernum||'n'||t.numgroup as id ,t.tempid,t.cruser as userid,t.crtime,t.faq_wt2 as title,da.faq_da2 as content from TM_DETAILS_LIST_FAQ_WT t join TM_DETAILS_LIST_FAQ_DA da on t.docid=da.docid and t.ordernum=da.ordernum and t.numgroup=da.numgroup and t.eid=da.eid">

      <field column="ID" name="id" />

      <!-- Template name for the parent row's TEMPID. -->
      <entity
          name="template"
          query="select te.name from kmstemplate te where te.id=${taocan.TEMPID}">
        <field column="NAME" name="template"/>
      </entity>

      <!-- Name of the creating user for the parent row's USERID. -->
      <entity
          name="user"
          query="select msg.name from tb_sys_loginmsg msg where msg.login_id='${taocan.USERID}'" >
        <field column="NAME" name="cruser"/>
      </entity>

      <!-- Creation time, handled by DateFormatTransformer with the given pattern. -->
      <field column="CRTIME" name="crtime" dateTimeFormat="yyyy-MM-dd HH:mm:ss"/>

      <!-- Question and answer text, both handled by ClobTransformer. -->
      <field column="TITLE" name="title" clob="true"/>
      <field column="CONTENT" name="content" clob="true"/>
    </entity>
  </document>
</dataConfig>
5、mysql 和文件集成
<dataConfig>
  <!--
    DIH configuration that combines two data sources:
    - "jdbc": the MySQL table gx_kmsindex, which lists the path of an XML
      file per row together with a change type and time;
    - "file": the XML files themselves, parsed with XPathEntityProcessor.
  -->
  <dataSource
      name="jdbc"
      type="JdbcDataSource"
      driver="com.mysql.jdbc.Driver"
      batchSize="-1"
      url="jdbc:mysql://127.0.0.1:3306/test?characterEncoding=UTF-8"
      user="root"
      password="root"/>
  <dataSource
      name="file"
      type="FileDataSource"
      encoding="utf-8" />

  <document>
    <!--
      Outer entity: one row per gx_kmsindex record.
      query            full import, all rows ordered by time
      deltaQuery       ids of rows of type 'add' or 'update' newer than the last index time
      deletedPkQuery   docid (aliased as id) of rows of type 'delete' newer than the last index time
      deltaImportQuery path of the row identified by ${dih.delta.id}
    -->
    <entity
        name="kms"
        pk="id"
        dataSource="jdbc"
        transformer="DateFormatTransformer"
        query="SELECT id,docid,path FROM gx_kmsindex ORDER BY TIME ASC"
        deltaQuery="select id from gx_kmsindex where time>'${dih.last_index_time}' and type in ('add','update') ORDER BY TIME ASC"
        deletedPkQuery="select docid as id from gx_kmsindex where type='delete' and time>'${dih.last_index_time}' ORDER BY TIME ASC"
        deltaImportQuery="select path from gx_kmsindex where id='${dih.delta.id}' ORDER BY TIME ASC" >

      <!--
        Inner entity: reads the XML file at the outer row's path and emits
        the fields below for each element matched by forEach.
        NOTE(review): forEach ends with a trailing slash; confirm the
        XPath reader treats it the same as "/datas/data".
      -->
      <entity
          name="xml"
          dataSource="file"
          processor="XPathEntityProcessor"
          url="${kms.path}"
          forEach="/datas/data/"
          transformer="DateFormatTransformer" >
        <field column="id" xpath="/datas/data/id" />
        <field column="title" xpath="/datas/data/title" />
        <field column="content" xpath="/datas/data/content" />
        <field column="crtime" xpath="/datas/data/crtime" dateTimeFormat="yyyy-MM-dd HH:mm:ss"/>
        <field column="templateid" xpath="/datas/data/templateid" />
        <field column="price" xpath="/datas/data/price" />
      </entity>
    </entity>
  </document>
</dataConfig>