【发布时间】:2016-07-14 05:19:42
【问题描述】:
这是我在 stackoverflow 上的第一个问题 我正在玩 Scrapy,一旦从 Scrapy 获取链接,我就被困在我想将数据库链接更新为已扫描 = 1 的地方。
# -*- coding: utf-8 -*-
import scrapy
import scrapy.http
from scrapy.spiders import CrawlSpider, Rule
from Testing.items import Testing100Item
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Response
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.responsetypes import Response
import re
import MySQLdb
from MySQLdb.cursors import SSCursor
import MySQLdb.cursors
##This is the connector to Database to Read New Domains
def getdomainsfromdb():
try:
conn = MySQLdb.connect(
host="localhost",
user="root",
passwd="root",
db="Testing",
cursorclass = MySQLdb.cursors.SSCursor)
cursor = conn.cursor()
query = """
SELECT domain_id, url, id_sitemap_links
from Sitemap_links
where scanned = 0;"""
cursor.execute(query)
return cursor.fetchall()
except Exception, e:
print e
##This will update the scanned to 1
def scanned(id_sitemap_links):
try:
conn = MySQLdb.connect(
host="localhost",
user="root",
passwd="root",
db="Testing",
cursorclass = MySQLdb.cursors.SSCursor)
cursor = conn.cursor()
query = """
UPDATE Sitemap_links
set scanned = 1
where id_sitemap_links = '%s' """
cursor.execute(query, (int(id_sitemap_links),))
except Exception, e:
print e
class Testing100Spider(scrapy.Spider):
name = "testing100"
#allowed_domains = []
#start_urls = ()
def start_requests(self):
for domain_id, url, id_sitemap_links in getdomainsfromdb():
yield Request(url, callback=self.parse, meta={'id_sitemap_links': id_sitemap_links})
def parse(self, response):
# domain_id = response.meta['domain_id']
id_sitemap_links = response.meta['id_sitemap_links']
scanned(id_sitemap_links)
print id_sitemap_links
# def parse(self, response):
# domain_id = Request(0)
# item = Testing100Item()
# #items = []
此时我可以从 getdomainsfromdb() 函数中读取域,但我无法更新 scrapy 正在处理的域的 id.. 我可以打印 id_sitemap_links 但 SQL 没有更新..
我在这里缺少什么? 提前谢谢你
【问题讨论】: