【发布时间】:2019-12-29 11:19:21
【问题描述】:
我已经抓取了一个网站来提取鞋子和衣服的价格、它们的图像 ID、图像 URL 和其他一些特征,我成功地将数据框写入 csv 文件,但我意识到数据框将每个特征写入不同的行,而它们必须聚集在一行中,我在下面显示了我的 csv 文件的示例输出。 有关如何更改代码的任何建议??
from bs4 import BeautifulSoup
import requests
import re
import csv
import pandas as pd
import os
import urllib.request
df = pd.DataFrame(columns = ['PostID','Description', 'Kind', 'Price', 'ImageID', 'ImageURL'])
def scraping():
global h , df
with open("/home/user/Documents/file.txt") as f:
urls = f.readlines()
urls = ([s.strip('\n') for s in urls ])
code_list = []
for url in urls:
code = url.split('/')[-1]
code_list.append(code)
df = df.append({'PostID': code}, ignore_index=True)
for br in soup.find_all("br"):
br.replace_with("\n")
try:
description = soup.find('div', attrs={'class':'ui fluid card post-description'}).find('div', attrs={'class':'content'})
print(description.text)
df = df.append({'Description': description.text}, ignore_index=True)
item_list = []
items = soup.find_all('span', attrs={'class':'item__title'})
for i in items:
item_list.append(i.text)
item_list.pop(0)
value_list=[]
values = soup.find_all('div', attrs={'class':'value'})
for v in values:
value_list.append(v.text)
my_dictionary = {}
for i in range(1,3):
my_dictionary[item_list[i]] = value_list[i]
df = df.append({'Kind':my_dictionary['نوع آگهی'] }, ignore_index=True)
df = df.append({'Price': my_dictionary['قیمت']}, ignore_index=True)
imageresult = []
path = '/home/user/images'
images = soup.find_all('img')
for img in images:
imgID = img.get('src').split('/')[-1]
df = df.append({'ImageID': imgID}, ignore_index=True)
df = df.append({'ImageURL': img.get('src')}, ignore_index=True)
urllib.request.urlretrieve(img.get('src'), os.path.join(my_path, os.path.basename(img.get('src'))))
print(imgID + img.get('src'))
else:
break
except:
print("your URL is invalid :" + url)
scraping()
df.to_csv('divartest14.csv', index = False , encoding = 'utf-8')
PostID Description Kind Price ImageID
QXZ5RjZj
adidas shoes
feminine
100$
QXZ5RjZj.jpg
【问题讨论】:
标签: python pandas csv web-scraping export-to-csv