【发布时间】:2019-03-02 18:18:46
【问题描述】:
我正在尝试使用 BS4 抓取一些数据,然后将其写入 CSV。我希望用 CSV 编写的模式类似于 this website。
所以它更像是 1: header 3 然后是相应的 table 然后 header3 和 table 依此类推... 但我得到了这种输出:
Total rainfall in millimetres for SherkinIsland
Mean temperature in degrees Celsius for SherkinIsland
Mean 10cm soil temperature for SherkinIsland at 0900 UTC
Global Solar Radiation in Joules/cm2 for SherkinIsland
Potential Evapotranspiration (mm) for SherkinIsland
Evaporation (mm) for SherkinIsland
Notes on the Data
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,199.1,67.2,116.6,129.3,93.0,17.2,48.8,62.5,82.1,,,,815.8
2017,66.7,78.5,132.7,14.6,39.2,112.3,89.9,78.6,150.8,115.5,51.9,147.5,1078.2
2016,185.8,113.0,61.5,68.8,59.4,61.5,69.7,111.1,111.1,64.4,43.3,78.3,1027.9
2015,106.6,78.0,88.9,18.5,110.0,77.4,127.0,87.0,121.2,52.8,107.7,292.7,1267.8
mean,132.7,101.4,94.7,73.7,73.7,75.1,78.0,88.3,92.4,127.6,120.1,130.3,1188.0
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,8.2,6.1,5.7,9.2,12.1,15.4,17.1,15.0,13.6,,,,11.4
2017,8.1,8.2,9.2,9.8,12.2,14.0,14.9,14.6,13.5,12.6,9.4,8.1,11.2
2016,8.4,7.0,7.5,8.5,12.0,14.3,14.4,15.2,14.5,12.3,8.0,9.5,11.0
2015,7.5,6.5,7.7,9.4,10.9,12.9,14.2,14.3,13.8,12.3,11.2,10.3,10.9
mean,7.5,7.5,8.4,9.4,11.7,13.9,15.5,15.7,14.3,12.0,9.5,8.0,11.1
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,6.9,4.7,5.1,9.5,13.4,17.3,19.0,16.2,n/a,,,,11.6
2017,7.5,7.9,8.4,10.2,12.9,15.4,16.2,15.3,13.4,12.3,8.5,6.8,11.3
2016,7.4,5.9,6.6,8.5,13.0,15.6,15.8,15.8,14.6,11.8,7.7,8.8,11.0
2015,6.6,5.3,6.8,9.3,11.7,14.5,14.8,14.7,13.1,11.2,10.6,9.6,10.7
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,9023,15831,29709,42026,58669,67070,65526,44784,29711,,,,362349
2017,8345,14868,28307,43479,57060,59325,57794,46218,33526,15375,11157,7084,382538
2016,7262,16452,27956,48481,60218,56262,53776,48503,25866,19137,12859,5660,382432
2015,8882,13475,30056,50190,55679,57207,57047,49551,33798,19483,8962,5121,389451
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,21.8,28.5,34.9,49.9,76.3,98.8,104.6,64.5,42.9,,,,522.2
2017,20.6,25.2,40.5,59.4,75.1,80.5,79.1,63.5,46.3,26.2,38.7,18.7,573.8
2016,20.8,27.3,39.7,61.4,77.3,81.1,73.7,68.6,43.9,39.0,23.5,21.0,577.3
2015,23.5,21.0,38.1,59.8,67.1,73.3,76.1,66.2,53.0,34.4,25.6,24.1,562.2
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
2018,30.5,41.0,55.4,81.0,116.3,143.1,147.9,96.8,64.3,,,,776.3
2017,27.1,37.8,64.0,88.6,117.8,127.9,122.2,97.5,71.3,39.2,46.4,24.6,864.4
2016,28.7,41.0,61.1,96.8,118.9,122.4,112.7,104.8,64.3,52.8,30.3,26.7,860.5
2015,32.7,31.1,60.5,95.8,113.2,115.7,120.8,101.4,75.9,47.2,35.1,32.8,862.2
mean,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a,n/a
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total
2018,226,262,303,188,115,38,12,30,N/A,N/A,N/A,N/A,N/A
2017,228,206,195,170,105,55,34,37,63,90,183,230,1596
2016,220,247,247,210,112,44,44,28,41,99,226,185,1702
2015,247,253,243,182,143,82,48,46,57,100,130,162,1693
我的源代码是:
import time
from os import getcwd
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas
import time, re
import csv
import uuid
class metEiren():
def __init__(self):
print("hurray33")
global downloadDir
downloadDir = ""
fp = webdriver.FirefoxProfile()
fp.set_preference("browser.download.folderList", 2)
fp.set_preference("browser.download.manager.showWhenStarting", False)
fp.set_preference("browser.download.dir", downloadDir)
fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")
options = Options()
options.add_argument("--headless")
global driver
driver = webdriver.Firefox(firefox_profile=fp,firefox_options=options)
driver.get("https://www.met.ie/climate/available-data/monthly-data")
verificationErrors = []
accept_next_alert = True
def scrap(self):
driver.get("https://www.met.ie/climate/available-data/monthly-data")
driver.execute_script("window.scrollTo(0, 1000)")
wait = WebDriverWait(driver, 10)
link = wait.until(EC.presence_of_element_located((By.LINK_TEXT, "Sherkin Island")))
link.click()
time.sleep(2)
uFileName = str(uuid.uuid4())
filname = downloadDir + uFileName + ".csv"
soup = BeautifulSoup(driver.page_source, 'html.parser')
headerList = []
tableContentList = []
for h in soup.find_all('h3'):
print(h.text)
headerList.append(h.text)
for table in soup.find_all('table'):
for row in table.find_all('tr'):
list_of_cells = []
for hd in row.find_all(['th','td']):
list_of_cells.append(hd.text.strip())
tableContentList.append(list_of_cells)
with open(filname, 'w', newline='') as f:
writer = csv.writer(f, delimiter = ',', quoting = csv.QUOTE_NONE,escapechar=',',lineterminator='\n')
length1 = len(headerList)
length2 = len(tableContentList)
for i in range(len(headerList)):
writer.writerows([headerList[i].strip(',').split(',')])
writer.writerows(tableContentList[s] for s in range (len(tableContentList)))
if __name__ == '__main__':
obj = metEiren()
obj.scrap()
任何帮助将不胜感激,谢谢
【问题讨论】:
-
这表明您已成功按正确顺序抓取数据。你能澄清一下它应该如何写入csv吗?你想要的输出是什么?
-
显然只需要重新排序。您能否展示一下您的
headerList和tableContentList的外观示例?我认为在您写入 csv 的最后一个for循环中可能存在一个小错误。 -
@Ajax1234 是的,我已经成功抓取了它,但是在写入 csv 时,我无法以这种格式创建文件 met.ie/climate/available-data/monthly-data。这意味着 headerlist 中包含 h3 和 tablecontentlist available table。书写格式应为 h3 --> th --> td --> h3-->above next table --> th --> td 以此类推
-
@panktijk 你可以在这个网址上看到输出codeshare.io/G70M7j
标签: python python-3.x csv web-scraping beautifulsoup