【问题标题】:Python 3 - Pandas, DataFrame, ValueError: cannot set a row with mismatched columnsPython 3 - Pandas、DataFrame、ValueError:无法设置列不匹配的行
【发布时间】:2018-12-06 02:22:09
【问题描述】:

我看到其他人也有这个问题,但我还没有从其他帖子中找到可行的解决方案,也许是因为我还没有正确实施它。我正在为一个职位发布网站制作刮板,这是我正在使用/遇到问题的代码部分:

import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time



# URL of specific job search
# FUTURE TO DO - Break it up into editable variables based on URL structure
URL = 'https://www.indeed.ca/jobs?q=data+scientist,+data+analyst,+python&l=Canada&jt=fulltime'
# conducting a request of the stated URL above:
page = requests.get(URL)
# specifying a desired format of “page” using the html parser - this allows python to read the various components of the page, rather than treating it as one long string.
soup = BeautifulSoup(page.text, 'html.parser')
# printing soup in a more structured tree format that makes for easier reading
# print(soup.prettify())



# Extract job title
def extract_job_title_from_result(soup): 
    jobs = []
    for div in soup.findAll(name='div', attrs={'class':'row'}):
        for a in div.findAll(name='a', attrs={'data-tn-element':'jobTitle'}):
            jobs.append(a['title'])
    return(jobs)
extract_job_title_from_result(soup)



# Extract company
def extract_company_from_result(soup): 
    companies = []
    for div in soup.findAll(name='div', attrs={'class':'row'}):
        company = div.findAll(name='span', attrs={'class':'company'})
        if len(company) > 0:
            for b in company:
                companies.append(b.text.strip())
        else:
            sec_try = div.findAll(name='span', attrs={'class':'result-link-source'})
            for span in sec_try:
                companies.append(span.text.strip())
    return(companies)
extract_company_from_result(soup)



# Extract location
def extract_location_from_result(soup): 
    locations = []
    spans = soup.findAll('span', attrs={'class': 'location'})
    for span in spans:
        locations.append(span.text)
    return(locations)
extract_location_from_result(soup)



# Extract salary
def extract_salary_from_result(soup): 
    salaries = []
    for div in soup.findAll(name='div', attrs={'class':'row'}):
        try:
          salaries.append(div.find('nobr').text)
        except:
            try:
                div_two = div.find(name='div', attrs={'class':'sjcl'})
                div_three = div_two.find('div')
                salaries.append(div_three.text.strip())
            except:
                salaries.append('Nothing_found')
    return(salaries)
extract_salary_from_result(soup)



# Extract job summary
# FUTURE TO DO - Extract full job description by each job page posting 
# ie. going through the link
def extract_summary_from_result(soup): 
    summaries = []
    spans = soup.findAll('span', attrs={'class': 'summary'})
    for span in spans:
        summaries.append(span.text.strip())
    return(summaries)
extract_summary_from_result(soup)



# Max results per city, which cities, and an output for the data
max_results_per_city = 10
city_set = ['Canada'] # 'New+York','Chicago','San+Francisco', 'Austin', 'Seattle', 'Los+Angeles', 'Philadelphia', 'Atlanta', 'Dallas', 'Pittsburgh', 'Portland', 'Phoenix', 'Denver', 'Houston', 'Miami', 'Washington+DC', 'Boulder']
columns = ['city', 'job_title', 'company_name', 'location', 'summary', 'salary']
sample_df = pd.DataFrame(columns = columns)



#scraping code:
for city in city_set:
    for start in range(0, max_results_per_city, 10):
        page = requests.get('http://www.indeed.ca/jobs?q=data+scientist,+data+analyst,+python&l=' + str(city) + '&jt=fulltime')  # + '&start=' + str(start))
        time.sleep(1)  #ensuring at least 1 second between page grabs

        soup = BeautifulSoup(page.text, 'lxml')
        for div in soup.find_all(name='div', attrs={'class':'row'}):
    #creating an empty list to hold the data for each posting
            job_post = []
    #append city name
            job_post.append(city)
    #grabbing job title
            for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
                job_post.append(a['title']) 
    #grabbing company name
                company = div.find_all(name='span', attrs={'class':'company'}) 
                if len(company) > 0:
                    [job_post.append(b.text.strip()) for b in company]
                else:
                    [job_post.append(span.text) for span in div.find_all(name='span', attrs={'class':'result-link-source'})]
    #grabbing location name
                [job_post.append(span.text) for span in div.findAll('span', attrs={'class': 'location'})]
    #grabbing summary text
                [job_post.append(span.text.strip()) for span in div.findAll('span', attrs={'class': 'summary'})]
    #grabbing salary
                div_two = div.find(name='div', attrs={'class':'salarySnippet'})
                job_post.append(div_two.text.strip() if div_two else 'Nothing found')
    #appending list of job post info to dataframe at index num
                sample_df.loc[len(sample_df) + 1] = job_post

#saving sample_df as a local csv file — define your own local path to save contents 
sample_df.to_csv('[filepath].csv', encoding='utf-8')

我的第二行或最后一行似乎有问题。收到错误:

    Traceback (most recent call last):
  File "script.py", line 128, in <module>
    sample_df.loc[len(sample_df) + 1] = job_post
  File "C:\Users\...Python\Python36\lib\site-packages\pandas\core\indexing.py", line 194, in __setitem__
    self._setitem_with_indexer(indexer, value)
  File "C:\Users\...\Python\Python36\lib\site-packages\pandas\core\indexing.py", line 439, in _setitem_with_indexer
    raise ValueError("cannot set a row with "
ValueError: cannot set a row with mismatched columns

我看到了一些使用 .append 而不是 .DataFrame 的解决方案,但是我收到一个错误,即 Pandas 不使用 .append 或类似的东西。有什么建议么?

这是我去年使用的旧代码:

https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b

提前致谢!

【问题讨论】:

  • 你也应该在你的代码中包含所有的导入(这样其他人可以很容易地尝试)
  • @SruthiV 谢谢,我已经更新以显示我的完整代码,以及我收到的完整错误消息。想法?

标签: python python-3.x pandas web-scraping beautifulsoup


【解决方案1】:

我无法重现该错误。我添加了依赖项(pandastime)并对未指定的对象(city_setmax_results_per_city)做了一些假设,然后我得到了一个包含所有条目的 DataFrame。我更改了一些薪水代码,因为网站的结构似乎发生了变化。不过,我没有运行任何扩展测试。

import pandas as pd
import time
columns = ['city', 'job_title', 'company_name', 'location', 'summary', 'salary']
sample_df = pd.DataFrame(columns = columns)
city_set = ('Toronto, ON', 'Calgary, AB', 'Montréal, QC')
max_results_per_city = 30

#scraping code:
for city in city_set:
    for start in range(0, max_results_per_city, 10):
        page = requests.get('http://www.indeed.ca/jobs?q=data+scientist,+data+analyst,+python&l=' + str(city) + '&jt=fulltime')  # + '&start=' + str(start))
        time.sleep(1)  #ensuring at least 1 second between page grabs

        soup = BeautifulSoup(page.text, 'lxml')
        for div in soup.find_all(name='div', attrs={'class':'row'}):
    #creating an empty list to hold the data for each posting
            job_post = []
    #append city name
            job_post.append(city)
    #grabbing job title
            for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
                job_post.append(a['title']) 
    #grabbing company name
                company = div.find_all(name='span', attrs={'class':'company'}) 
                if len(company) > 0:
                    [job_post.append(b.text.strip()) for b in company]
                else:
                    [job_post.append(span.text) for span in div.find_all(name='span', attrs={'class':'result-link-source'})]
    #grabbing location name
                [job_post.append(span.text) for span in div.findAll('span', attrs={'class': 'location'})]
    #grabbing summary text
                [job_post.append(span.text.strip()) for span in div.findAll('span', attrs={'class': 'summary'})]
    #grabbing salary
                div_two = div.find(name='div', attrs={'class':'salarySnippet'})
                job_post.append(div_two.text.strip() if div_two else 'Nothing found')
    #appending list of job post info to dataframe at index num
                sample_df.loc[len(sample_df) + 1] = job_post

【讨论】:

  • 感谢您的回复。我仍然收到相同的错误消息。我将更新我的原始问题以包含完整的代码(以及您的编辑),包括依赖项。有什么想法吗?
  • 另外,当我注释掉底部附近的 .loc 行并尝试将其输出到 .csv 时,创建的 csv 文件是空白的,减去列标题。
【解决方案2】:

好吧,我没有得到主要的解决方案,但我只使用 .writerow 做了一个解决方法,它也能正常工作。稍后我将使用数据框。谢谢大家!

#scraping code:
with open('output.csv', 'a', newline='') as f_output:
    csv_output = csv.writer(f_output) #delimiter=",")
    for city in city_set:
        for start in range(0, max_results_per_city, 10):
            page = requests.get('http://www.indeed.ca/jobs?q=data+scientist,+data+analyst,+python&l=' + str(city) + '&jt=fulltime')  # + '&start=' + str(start))
            time.sleep(1)  #ensuring at least 1 second between page grabs

            soup = BeautifulSoup(page.text, 'lxml')
            for div in soup.find_all(name='div', attrs={'class':'row'}):
    #creating an empty list to hold the data for each posting
                job_post = []
    #append city name
                job_post.append(city)
    #grabbing job title
                for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
                    job_post.append(a['title']) 
    #grabbing company name
                    company = div.find_all(name='span', attrs={'class':'company'}) 
                    if len(company) > 0:
                        [job_post.append(b.text.strip()) for b in company]
                    else:
                        [job_post.append(span.text) for span in div.find_all(name='span', attrs={'class':'result-link-source'})]
    #grabbing location name
                    [job_post.append(span.text) for span in div.findAll('span', attrs={'class': 'location'})]
    #grabbing summary text
                    [job_post.append(span.text.strip()) for span in div.findAll('span', attrs={'class': 'summary'})]
    #grabbing salary
                    div_two = div.find(name='div', attrs={'class':'salarySnippet'})
                    job_post.append(div_two.text.strip() if div_two else 'Nothing found')
    #appending list of job post info to dataframe at index num
                    #sample_df.loc[len(sample_df) + 1] = job_post

#saving sample_df as a local csv file — define your own local path to save contents 
                csv_output.writerow([job_post])
                #sample_df.to_csv('[filepath].csv', encoding='utf-8')

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 2021-06-17
    • 1970-01-01
    • 1970-01-01
    • 2019-01-31
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多