【问题标题】:How do I clean addresses so Nominatim can geolocate them properly?如何清理地址以便 Nominatim 可以正确定位它们?
【发布时间】:2020-07-29 08:44:13
【问题描述】:

我正在尝试使用 Nominatim 对从网络上抓取的一批地址进行地理定位。Nominatim 可以处理“标准”地址,例如 123 StreetName St., ExampleSuburb;但我抓取的一些地址含有“非标准”元素,例如 仓库 3,123 StreetName., ExampleSuburb。

有没有一种方法可以去除“非标准”元素,让 Nominatim 更容易找到它们?或者有没有办法让 Nominatim 尝试在非标准元素的情况下对地址进行地理定位?

例如,下面的代码在执行时会引发类型错误。我不知道该如何重新格式化地址来避免这种情况,因为这些地址是直接从网站上抓取下来的,我没有对它们做任何处理。

from bs4 import BeautifulSoup
import requests
from requests import get
import sqlite3
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

def scrapecafes(city, area):
    """Scrape cafe names and addresses from a Broadsheet guide and geocode them.

    Builds the guide URL from ``city`` and ``area``, downloads the page,
    collects the text of every ``h2`` with class ``venue-title`` and of every
    element with class ``address-content``, looks each address up with
    Nominatim, and finally prints the zipped result.

    Args:
        city: City slug interpolated into the guide URL.
        area: Area/suburb slug interpolated into the guide URL.

    Returns:
        None; output is printed only.
    """

    #url = 'https://www.broadsheet.com.au/melbourne/guides/best-cafes-thornbury' #go to the website
    url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
    response = requests.get(url, timeout=5)

    soup_cafe_names = BeautifulSoup(response.content, "html.parser")
    # Bare expression: the result of type() is discarded, so this line has no effect.
    type(soup_cafe_names)

    cafeNames = soup_cafe_names.findAll('h2', attrs={"class":"venue-title", }) #scrape the elements
    cafeNamesClean = [cafe.text.strip() for cafe in cafeNames] #clean the elements
    #cafeNameTuple = [(cafe,) for cafe in cafeNamesClean]

    #print(cafeNamesClean)

    #addresses
    # The same response body is parsed a second time here.
    soup_cafe_addresses = BeautifulSoup(response.content, "html.parser")
    # Bare expression with no effect, as above.
    type(soup_cafe_addresses)

    cafeAddresses = soup_cafe_addresses.findAll( attrs={"class":"address-content" })
    # Unlike the names, the address text is not stripped of surrounding whitespace.
    cafeAddressesClean = [address.text for address in cafeAddresses]
    #cafeAddressesTuple = [(address,) for address in cafeAddressesClean]

    #print(cafeAddressesClean)


    ##geocode addresses
    locator = Nominatim(user_agent="myGeocoder")
    # NOTE(review): this rate-limited wrapper is assigned but never called;
    # the loop below uses locator.geocode directly.
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

    try:
        for item in cafeAddressesClean:
            location = (locator.geocode(item))
            # NOTE(review): `location` is a single geocode result, yet it is
            # iterated here. If the lookup returned None (presumably what geopy
            # does when nothing matches -- confirm), iterating it raises
            # TypeError, which the bare except below swallows, ending the loop.
            # lat/long are also rebound on every pass, so only the values from
            # the last completed iteration survive.
            lat = [location.latitude for item in location]
            long = [location.longitude for item in location]
            print(location)

    except:
        # Bare except: every error raised inside the loop is silently discarded.
        pass

    #zip up for table
    # NOTE(review): if the loop failed before lat/long were ever assigned,
    # this line raises because those names are unbound.
    fortable = zip(cafeNamesClean, cafeAddressesClean, lat, long)
    # zip() returns a lazy iterator in Python 3, so this prints the zip
    # object's repr rather than the rows.
    print(fortable)

# NOTE(review): melbourne and fitzroy are bare names, not string literals;
# unless they are defined elsewhere, this call raises NameError.
scrapecafes(melbourne, fitzroy)

【问题讨论】:

    标签: python web-scraping geocoding geopy nominatim


    【解决方案1】:

    您的脚本中有 2 个问题。

    1. 您在遍历 cafeAddressesClean,但没有把每次地理编码的结果保存下来。
    2. 调用 zip 之后,没有把结果转换成列表。

    下面的代码会把这些值插入 sqlite 数据库,总共插入了 10 条记录。

    from bs4 import BeautifulSoup
    import requests
    from requests import get
    import sqlite3
    import geopandas
    import geopy
    from geopy.geocoders import Nominatim
    from geopy.extra.rate_limiter import RateLimiter
    
    #cafeNamesthornbury
    def scrapecafes(city, area):
        """Scrape a Broadsheet "best cafes" guide, geocode the venues and save them to SQLite.

        Downloads ``https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}``,
        reads the venue names (``h2`` elements with class ``venue-title``) and
        addresses (elements with class ``address-content``), geocodes each
        address with Nominatim and inserts ``(name, address, latitude,
        longitude)`` rows into the ``scraper`` table of ``25july_database.db``.

        Addresses for which the geocoder returns no result are reported and
        skipped instead of aborting the whole run.

        Args:
            city: City slug used in the guide URL, e.g. ``'melbourne'``.
            area: Suburb slug used in the guide URL, e.g. ``'thornbury'``.

        Returns:
            None. Progress is printed and rows are written to the database.
        """
        url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
        response = requests.get(url, timeout=5)

        # Names and addresses live in the same document, so parse it once.
        soup = BeautifulSoup(response.content, "html.parser")

        cafeNamesClean = [
            cafe.text.strip()
            for cafe in soup.find_all('h2', attrs={"class": "venue-title"})
        ]
        print(cafeNamesClean)

        cafeAddressesClean = [
            address.text
            for address in soup.find_all(attrs={"class": "address-content"})
        ]
        print(cafeAddressesClean)

        # Geocode through the rate limiter so consecutive lookups are spaced
        # at least one second apart instead of hitting the service back to back.
        locator = Nominatim(user_agent="myGeocoder")
        geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

        # NOTE(review): the query is only the scraped street/suburb text, so the
        # geocoder may match a same-named place elsewhere; consider restricting
        # the search (e.g. by country) if results look wrong -- confirm against
        # the geopy documentation.
        fortable = []
        for name, address in zip(cafeNamesClean, cafeAddressesClean):
            location = geocode(address)
            if location is None:
                # No match: the table columns are NOT NULL, so skip the row
                # rather than crash on location.latitude.
                print("Could not geocode address, skipping:", address)
                continue
            fortable.append((name, address, location.latitude, location.longitude))

        sqlite_create_table_query = ''' CREATE TABLE IF NOT EXISTS scraper (
                                        name TEXT NOT NULL,
                                        address TEXT NOT NULL,
                                        latitude FLOAT NOT NULL,
                                        longitude FLOAT NOT NULL
                                        );'''
        sqlite_insert_name_param = """INSERT INTO scraper VALUES (?,?,?,?);"""

        # Pre-bind the connection so the finally clause is safe even when
        # sqlite3.connect() itself fails.
        sqliteConnection = None
        # Tracks which phase is running so the error message stays specific.
        failure_message = "Error while connecting to sqlite"
        try:
            sqliteConnection = sqlite3.connect('25july_database.db')
            cursor = sqliteConnection.cursor()
            print("Database created and Successfully Connected to 25july_database")

            cursor.execute("select sqlite_version();")
            record = cursor.fetchall()
            print("SQLite Database Version is: ", record)

            failure_message = "Error while creating a sqlite table"
            cursor.execute(sqlite_create_table_query)
            sqliteConnection.commit()
            print("SQLite table created")

            failure_message = "Failed to insert data into sqlite table"
            # Insert all rows in one batch and commit once.
            cursor.executemany(sqlite_insert_name_param, fortable)
            sqliteConnection.commit()
            print("Total", cursor.rowcount, "Records inserted successfully into table")

            cursor.close()

        except sqlite3.Error as error:
            print(failure_message, error)

        finally:
            if sqliteConnection is not None:
                sqliteConnection.close()
                print("The SQLite connection is closed")
    
    # Run the scrape only when executed as a script, so importing this module
    # does not trigger network requests or database writes.
    if __name__ == "__main__":
        scrapecafes('melbourne', 'thornbury')
    

    运行脚本后:

    Prior| 637 High Street, Thornbury|-37.76159772|144.99994556
    Rat the Cafe| 72 Wales Street, Thornbury|-37.7618172|145.0091904
    Ampersand Coffee and Food| 863 High Street, Thornbury|-37.754689125|145.0010879
    Umberto Espresso Bar| 822 High Street, Thornbury|-37.7532839|145.0016297
    Brother Alec| 719 High Street, Thornbury|-37.7590570333333|145.0003715
    Short Round| 731 High Street, Thornbury|-37.758653675|145.000430475
    Jerry Joy| 128  Mansfield Street, Thornbury|-37.7573008|145.0096578
    The Old Milk Bar| 144 Dundas Street, Thornbury|-37.7544244|145.020563
    Little Henri| 848  High Street, Thornbury|51.6087678|-2.5260139
    Northern Soul| 843 High Street, Thornbury|-37.7552406555556|145.000992355556
    

    【讨论】:

    • 非常感谢!感觉你解答了我很多疑问,非常感激。还有一个问题:我在排查自己的代码时,列表这部分已经写到了这一步:`for item in cafeAddressesClean: location = (locator.geocode(item)) locList.append(item)`,而你写的是 `location.append(locator.geocode(item))`,你的代码看起来更简洁。我的写法是错了,还是只是比较乱?
    猜你喜欢
    • 1970-01-01
    • 2023-03-23
    • 2013-02-11
    • 2021-11-23
    • 1970-01-01
    • 1970-01-01
    • 2010-11-24
    • 2021-12-21
    • 1970-01-01
    相关资源
    最近更新 更多