【问题标题】:Python - get first a tag parsing HTML(Python - 解析 HTML 时获取第一个 a 标签)
【发布时间】:2014-09-25 02:53:53
【问题描述】:

我正在使用 Python 和 Beautiful Soup 来解析这个网页:https://rpi.sodexomyway.com/dining-choices/res/sage.html 。在“菜单”(On the Menu)部分,我想获取第一个链接的 URL。

这是我正在使用的代码:

monthly_urls = soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)

现在它得到的是第二个 a 标签,我不知道为什么。我原以为它至少会同时得到两个标签,但它只得到了第二个。

我想修改代码,让它获取第一个 a 标签;或者能够按 a 标签的文本内容进行搜索,从而找到它。

对于第二部分,我只是在谈论例如,如果 a 标签是

<a>new tag</a>

我想搜索“新标签”

编辑:

完整的代码如下。我需要当前这一周的链接,所以要么从菜单部分获取第一个链接,要么按日期搜索该链接……

    #  Created by Spencer Fontein on 5/28/14.
#  Copyright (c) 2014 Spencer Fontein. All rights reserved.

# coding: utf-8

import pprint
from lxml import etree
import cgi
from bs4 import BeautifulSoup
import datetime
import urllib2
import cookielib
import re

# Prefix prepended to every generated XML file name by the __main__ script.
# With the empty string the files are opened by bare file name; the trailing
# commented-out literal is an alternative directory prefix.
output_path = ""#"/home/spencerf/public_html/rpi/"

def Get_website_text(url):
    """Download ``url`` and return the raw response body.

    The request is made through an opener that follows redirects, handles
    both HTTP and HTTPS, processes cookies and sends a desktop-browser
    ``User-agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``response.read()`` yields for the fetched page.

    Raises:
        Any error raised by ``opener.open`` or ``response.read`` is
        propagated to the caller; the response is closed either way.
    """
    # The jar is bound to this file name, but nothing in this function calls
    # cj.load() or cj.save(), so cookies are only kept in memory while the
    # request (including any redirects) is being served.
    cookie_file = 'mfp.cookies'
    cj = cookielib.MozillaCookieJar(cookie_file)

    # set up opener to handle cookies, redirects etc
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(cj)
    )

    # pretend we're a web browser and not a python script
    opener.addheaders = [('User-agent',
        ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) '
         'AppleWebKit/535.1 (KHTML, like Gecko) '
         'Chrome/13.0.782.13 Safari/535.1'))
    ]

    response = opener.open(url)
    try:
        # Close the connection even when reading the body fails.
        web_text = response.read()
    finally:
        response.close()

    return web_text

#get union menus
def getUnionMenuUrls(soup):
    """Collect ``[name, url]`` pairs for the menu links inside the accordion.

    Every ``<a href=...>`` found in the first ``div`` with id
    ``accordion_23473`` whose href contains ``.htm`` is kept.  The link text
    is reduced to a short name by deleting a fixed list of fragments, and
    the href is appended to the site root to form the URL.

    Args:
        soup: Parsed page exposing ``findAll``; the matched div is called
            with ``('a', href=True)`` to list its anchors.

    Returns:
        A list of two-item lists ``[name, absolute_url]`` in page order.
    """
    site_root = "https://rpi.sodexomyway.com"

    # Fragments are deleted one after another in exactly this order; the
    # result of one deletion is the input of the next.
    removed_before_newline = (
        "Click ", 'For ', 'Menu ', 'of ', 'Week ', 'Here ', 'Of ',
        'January ', 'February ', 'March ', 'April ', 'May ',
        'June ', 'July ', 'August ', 'September ',
        'October ', 'November ', 'December ',
        '1', "2", "3", "4", '5', "6", "7", "8", '9', "0", '-',
    )
    removed_after_newline = ('/', '!', ', ', ' ', 'College')

    accordion = soup.findAll('div', {'id': 'accordion_23473'})[0]
    collected = []
    for anchor in accordion('a', href=True):
        href = anchor['href']
        if ".htm" not in href:
            continue

        label = str(anchor.text)
        for fragment in removed_before_newline:
            label = label.replace(fragment, '')
        # Each newline in the link text is turned into the fixed file stem.
        label = label.replace('\n', 'rpi_russell_sage_menu')
        for fragment in removed_after_newline:
            label = label.replace(fragment, '')

        collected.append([label, site_root + href])
    return collected


def get_xml(url):
    """Download a weekly menu page and render it as a nested XML string.

    The page is parsed with ``etree.HTML``.  Each ``td.dayouter`` cell
    becomes a ``<day>``; inside it, rows of ``table.dayinner`` open a
    ``<meal>``, a ``<counter>`` and/or a ``<dish>`` depending on which cell
    classes (``mealname``, ``station``, ``menuitem``) the row contains.
    Dish names are written as ``<name>`` elements after ``cgi.escape``.

    Args:
        url: Address of the menu page to convert.

    Returns:
        The generated lines joined with newlines, each line encoded as
        UTF-8.

    Raises:
        IndexError: If a day cell has no ``./a/@name`` value, or a
            ``mealname`` cell has no text node.
    """
    tag_stack = []
    output_lines = []

    response = urllib2.urlopen(url)
    try:
        # Non-breaking-space entities are dropped before parsing.
        html = response.read().replace('&nbsp;',"")
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    xml = etree.HTML(html)

    open_tag(tag_stack, output_lines, "menu", "")
    # make the xml for each day
    for day in xml.xpath('//td[@class="dayouter"]'):
        day_name = day.xpath('./a/@name')[0]
        safe_open_tag(tag_stack, output_lines, "day", "menu", day_name)

        for dayinner_tr in day.xpath('.//table[@class="dayinner"]//tr'):
            # change meal
            if dayinner_tr.xpath('./td[@class="mealname"]'):
                meal_name = dayinner_tr.xpath('./td[@class="mealname"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "meal", "day", meal_name)

            # change counter (only when the station cell actually has text)
            counter_names = dayinner_tr.xpath('./td[@class="station"]/text()')
            if counter_names:
                safe_open_tag(tag_stack, output_lines, "counter", "meal", counter_names[0])

            # change dish
            if dayinner_tr.xpath('./td[@class="menuitem"]'):
                item_name = "".join(dayinner_tr.xpath('./td[@class="menuitem"]/div//text()')).strip()
                safe_open_tag(tag_stack, output_lines, "dish", "counter", "")
                output_lines.append("<name>%s</name>" % cgi.escape(item_name))

    # An empty parent never matches a tag name, so every open tag is closed.
    close_tags(tag_stack, output_lines, "")
    output_string = '\n'.join([line.encode('utf-8') for line in output_lines])

    return output_string

# close the tags up to the parent of last tag in tag_stack
def close_tags(tag_stack, output_lines, parent_tag):
    """Pop and emit closing tags until ``parent_tag`` is on top of the stack.

    Args:
        tag_stack: List of currently open tag names, innermost last;
            modified in place.
        output_lines: List that receives one closing-tag line per popped
            tag, indented by one space per tag still open.
        parent_tag: Tag name at which to stop.  It is left on the stack; if
            it never appears on top, the stack is emptied.
    """
    while tag_stack:
        if tag_stack[-1] == parent_tag:
            break
        closed = tag_stack.pop()
        depth = len(tag_stack)
        output_lines.append('%s</%s>' % (' ' * depth, closed))

# open the new_tag using the suitable style based on name_property
def open_tag(tag_stack, output_lines, new_tag, name_property):
    """Emit an opening tag and push it onto the stack.

    Args:
        tag_stack: List of currently open tag names; ``new_tag`` is appended.
        output_lines: List that receives the opening-tag line, indented by
            one space per tag already open.
        new_tag: Name of the element to open.
        name_property: Value for the ``name`` attribute.  When empty the tag
            is written without attributes.
    """
    # Local import so the block does not depend on the file's import list.
    from xml.sax.saxutils import escape

    indent = ' ' * len(tag_stack)
    if name_property:
        # The value lands inside a double-quoted attribute, so &, <, > and "
        # must be escaped or the generated XML is malformed.
        safe_name = escape(name_property, {'"': '&quot;'})
        output_lines.append(indent + '<%s name="%s">' % (new_tag, safe_name))
    else:
        output_lines.append(indent + '<%s>' % new_tag)
    tag_stack.append(new_tag)

# check if the new_tag parent is in the stack, if not it'll add the parent
def safe_open_tag(tag_stack, output_lines, new_tag, parent_tag, name_property):
    """Open ``new_tag`` directly beneath ``parent_tag``.

    When ``parent_tag`` is already somewhere on the stack, everything above
    it is closed first.  Otherwise a bare ``parent_tag`` element (without a
    name attribute) is opened at the current depth.  In both cases
    ``new_tag`` is then opened with ``name_property``.

    Args:
        tag_stack: List of currently open tag names; modified in place.
        output_lines: List that receives the generated lines.
        new_tag: Name of the element to open.
        parent_tag: Name of the element that must enclose ``new_tag``.
        name_property: Value for the ``name`` attribute of ``new_tag``.
    """
    if parent_tag in tag_stack:
        close_tags(tag_stack, output_lines, parent_tag)
    else:
        indent = ' ' * len(tag_stack)
        output_lines.append('%s<%s>' % (indent, parent_tag))
        tag_stack.append(parent_tag)
    open_tag(tag_stack, output_lines, new_tag, name_property)

# sample use of get_xml function


# In[17]:

if __name__ == "__main__":
    base_url_u = "https://rpi.sodexomyway.com/dining-choices/res/sage.html"
    htmltext_u = Get_website_text(base_url_u)
    soup_u = BeautifulSoup(htmltext_u)
    menu_url_list = getUnionMenuUrls(soup_u)
    for menu in menu_url_list:
        if '.htm' in menu[1]:
            ofname = str(menu[0].replace(" ","A")) + ".xml"
            output_file = output_path + ofname
            open(output_file, "w").write(get_xml(menu[1]))
        else:
            print menu[0],":",menu[1], "is not valid html."

编辑 2:

date function

def getCurrentWeekMenu(date1, date2, now=None):
    """Choose which of two menus applies at ``now``.

    Args:
        date1: Indexable pair for the first menu: ``date1[0]`` is a full
            English month name and ``date1[1]`` is a day whose last two
            characters are discarded before conversion to an integer.
            NOTE(review): presumably an ordinal such as "22nd" - confirm
            against the caller.
        date2: Same layout, for the second menu.
        now: Optional ``datetime`` to evaluate against; defaults to
            ``datetime.datetime.now()``.

    Returns:
        ``0`` when the first menu is selected, ``1`` for the second.

    Raises:
        KeyError: If either month name is not a full English month name.
        ValueError: If a day is not an integer once its last two characters
            are removed.
    """
    if now is None:
        now = datetime.datetime.now()

    monthstr = "January,February,March,April,May,June,July,August,September,October,November,December"
    month_numbers = dict(zip(monthstr.split(','), range(1, 13)))
    menu_1_month = month_numbers[str(date1[0])]
    # Looked up only so an unknown month name for the second menu still
    # raises KeyError; the value itself does not take part in the decision.
    menu_2_month = month_numbers[str(date2[0])]

    # Days must be integers: as strings they compare lexicographically with
    # each other ("9" > "10") and never compare meaningfully with now.day.
    menu_1_day = int(str(date1[1])[:-2])
    menu_2_day = int(str(date2[1])[:-2])

    if menu_1_day > menu_2_day:
        # The second menu's day number is smaller than the first's.
        if now.day >= menu_1_day:
            menu = 1
        else:
            menu = 2
    else:
        if now.day >= menu_2_day:
            menu = 2
        elif now.month > menu_1_month:
            menu = 2
        else:
            menu = 1
    return menu - 1

【问题讨论】:

  • 你能给出你想要的样本输出吗?

标签: python html parsing beautifulsoup


【解决方案1】:

我运行你的代码没有问题

# NOTE(review): this imports the `BeautifulSoup` module, whereas the
# question's script imports the class from `bs4` - confirm which package
# is installed before running.
from BeautifulSoup import BeautifulSoup
import requests
# Fetch the dining page and parse the decoded response text.
response = requests.get('https://rpi.sodexomyway.com/dining-choices/res/sage.html')
soup = BeautifulSoup(response.text)
# Same expression as in the question: every <a> with an href attribute
# inside the first div whose id is accordion_23473.
print soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)

>>> [<a href="#">On the Menu</a>,
     <a href="/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm" target="_blank">
                     9/22/2014 - 9/28/2014</a>,
     <a href="/images/WeeklyMenuRSDH%209-29-14_tcm1068-29441.htm" target="_blank">
                     9/29/2014 - 10/5/2014</a>,
     <a href="#">Hours of Operation</a>]

# Take the anchor at index 1 of that list and read its href attribute
# (index 0 in the output shown above is an href="#" link).
url = dict(soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)[1].attrs)['href']
# output
u'/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm'

回答问题的第二部分

import re
# Search the document text with a compiled regular expression for 'new tag'.
soup.find(text=re.compile('new tag'))

更新 - 添加当前周过滤器

def getUnionMenuUrls(soup, today=None):
    """Return the URL of the weekly menu whose date range contains ``today``.

    Only the anchors at positions 1 and 2 of the accordion's link list are
    considered.  Each link text is expected to look like
    ``M/D/YYYY - M/D/YYYY``; links whose text does not parse that way are
    skipped.

    Args:
        soup: Parsed page exposing ``findAll``; the matched div is called
            with ``('a', href=True)`` to list its anchors.
        today: Optional ``date`` or ``datetime`` to test against; defaults
            to the current local date.

    Returns:
        The site root plus the matching link's href, or ``None`` when no
        link covers ``today``.
    """
    # Compare calendar dates only: a datetime carrying a time of day is
    # later than midnight of the range's last day and would never match it.
    if today is None:
        today = datetime.date.today()
    elif isinstance(today, datetime.datetime):
        today = today.date()

    monthly_urls = soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)[1:3] # cut extra links
    url = "https://rpi.sodexomyway.com"
    for tag in monthly_urls:
        if ".htm" not in tag['href']:
            continue
        # The link text may be padded with newlines/indentation, which
        # strptime rejects, so trim each half of the range.
        datestrings = [part.strip() for part in tag.text.split(' - ')]
        if len(datestrings) != 2:
            continue
        try:
            first_day, last_day = [
                datetime.datetime.strptime(d, '%m/%d/%Y').date()
                for d in datestrings
            ]
        except ValueError:
            # Not a date range label; ignore this link.
            continue
        if first_day <= today <= last_day:
            return url + tag['href']
    return None

【讨论】:

  • 你说得对,问题不可能出在这部分代码上。让我贴出完整的脚本:我要么想得到第一个链接作为输出,要么能搜索到对应本周的那个标签,这样就只会得到那个标签。这样说得通吗?
  • 我不认为按名称搜索是个好主意,因为绑定到页面架构总是好的,但不绑定到内容。如果您需要以最小的痛苦获得 menu_urls 中的第一个链接 - 使用 monthly_urls = soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)[1:2] # line 54
  • 好的,这样可以得到第一个链接,但我想得到当前这一周的链接,该怎么做?我已经在问题正文中添加了一个获取日期的函数,之后我想获取本周的链接,有什么想法吗?
  • 你可以忽略我发布的功能,我只是尽可能地询问是否有办法获得本周的链接?有意义吗?
  • 我已根据您的要求更新了我的答案,请注意,它只返回新的 url,而不是 [name, url]
猜你喜欢
  • 1970-01-01
  • 2014-03-27
  • 2012-02-24
  • 1970-01-01
  • 1970-01-01
  • 2012-07-22
  • 2018-10-12
  • 2012-04-13
  • 1970-01-01
相关资源
最近更新 更多