如何清理地址以便 Nominatim 可以正确定位它们？答案

【问题标题】：How do I clean addresses so Nominatim can geolocate them properly?如何清理地址以便 Nominatim 可以正确定位它们？
【发布时间】：2020-07-29 08:44:13
【问题描述】：

我正在尝试使用 Nominatim 对从网络上抓取的一组地址进行地理编码。Nominatim 可以很好地处理“标准”地址，例如 123 StreetName St., ExampleSuburb；但我抓取的一些地址含有“非标准”元素，例如 仓库 3, 123 StreetName., ExampleSuburb。

有没有一种方法可以去除“非标准”元素，让 Nominatim 更容易找到它们？或者有没有办法让 Nominatim 尝试在非标准元素的情况下对地址进行地理定位？

例如，下面的代码在执行时会引发类型错误（TypeError）。我不知道该如何重新格式化地址来避免这种情况，因为这些地址是直接从网站上抓取下来的，我完全没有做过任何处理。

from bs4 import BeautifulSoup
import requests
from requests import get
import sqlite3
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

def scrapecafes(city, area):
    """Scrape cafe names and addresses from a Broadsheet guide page and geocode them.

    Downloads the "best cafes" guide page for the given city/area, extracts
    the venue titles and address elements, asks Nominatim for the coordinates
    of every address, and prints the zipped result.

    Args:
        city: City slug interpolated into the guide URL.
        area: Area/suburb slug interpolated into the guide URL.

    Returns:
        None. Results are only printed.
    """

    #url = 'https://www.broadsheet.com.au/melbourne/guides/best-cafes-thornbury' #go to the website
    url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
    response = requests.get(url, timeout=5)

    soup_cafe_names = BeautifulSoup(response.content, "html.parser")
    # The result of type() is discarded, so this line has no effect.
    type(soup_cafe_names)

    cafeNames = soup_cafe_names.findAll('h2', attrs={"class":"venue-title", }) #scrape the elements
    cafeNamesClean = [cafe.text.strip() for cafe in cafeNames] #clean the elements
    #cafeNameTuple = [(cafe,) for cafe in cafeNamesClean]

    #print(cafeNamesClean)

    #addresses
    # The same response body is parsed a second time here.
    soup_cafe_addresses = BeautifulSoup(response.content, "html.parser")
    # The result of type() is discarded, so this line has no effect.
    type(soup_cafe_addresses)

    cafeAddresses = soup_cafe_addresses.findAll( attrs={"class":"address-content" })
    # Unlike the names above, the address text is not stripped.
    cafeAddressesClean = [address.text for address in cafeAddresses]
    #cafeAddressesTuple = [(address,) for address in cafeAddressesClean]

    #print(cafeAddressesClean)


    ##geocode addresses
    locator = Nominatim(user_agent="myGeocoder")
    # NOTE: this rate-limited wrapper is created but never called below; the
    # loop uses locator.geocode directly.
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

    try:
        for item in cafeAddressesClean:
            # `location`, `lat` and `long` are reassigned on every iteration,
            # so only the values from the last address survive the loop.
            location = (locator.geocode(item))
            # NOTE(review): these comprehensions iterate over the single
            # geocoding result itself rather than over a list of results;
            # presumably geocode() returns None when nothing matches, which
            # would make this iteration raise TypeError -- confirm in the
            # geopy docs.
            lat = [location.latitude for item in location]
            long = [location.longitude for item in location]
            print(location)

    # Bare except: any error raised in the loop is silently discarded.
    except:
        pass

    #zip up for table
    # `lat` and `long` are only bound inside the try block above, so they are
    # undefined here if the first iteration failed (or the list was empty).
    # zip() returns a lazy iterator, so the print shows a zip object, not rows.
    fortable = zip(cafeNamesClean, cafeAddressesClean, lat, long)
    print(fortable)

# NOTE: `melbourne` and `fitzroy` are bare names that are not defined anywhere
# in this snippet (they are not string literals).
scrapecafes(melbourne, fitzroy)

【问题讨论】：

标签： python web-scraping geocoding geopy nominatim

【解决方案1】：

您的脚本中有 2 个问题。

您在遍历 cafeAddressesClean，但没有把每次循环得到的结果保存到任何地方。
对这些列表执行 zip 之后，您没有把结果转换为列表。

下面的代码会把这些值插入到 sqlite 数据库中，总共插入了 10 条记录。

from bs4 import BeautifulSoup
import requests
from requests import get
import sqlite3
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

#cafeNamesthornbury
def scrapecafes(city, area):
    """Scrape a Broadsheet cafe guide, geocode each cafe and store it in SQLite.

    Downloads the "best cafes" guide page for the given city/area, extracts
    the venue names and addresses, geocodes every address through a
    rate-limited Nominatim client and inserts one
    (name, address, latitude, longitude) row per geocoded cafe into the
    ``scraper`` table of ``25july_database.db``.

    Addresses that Nominatim cannot resolve are reported and skipped instead
    of aborting the whole run, because the table requires non-null
    coordinates.

    Args:
        city: City slug interpolated into the guide URL, e.g. ``'melbourne'``.
        area: Area/suburb slug interpolated into the guide URL,
            e.g. ``'thornbury'``.

    Returns:
        None. Progress and errors are printed; results go to the database.
    """
    url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
    response = requests.get(url, timeout=5)

    # Parse the page once and reuse the tree for both lookups.
    soup = BeautifulSoup(response.content, "html.parser")

    cafeNames = soup.findAll('h2', attrs={"class": "venue-title"})
    cafeNamesClean = [cafe.text.strip() for cafe in cafeNames]
    print(cafeNamesClean)

    cafeAddresses = soup.findAll(attrs={"class": "address-content"})
    cafeAddressesClean = [address.text for address in cafeAddresses]
    print(cafeAddressesClean)

    ##geocode addresses
    locator = Nominatim(user_agent="myGeocoder")
    # Always go through the rate limiter so requests are spaced at least one
    # second apart.
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

    fortable = []
    for name, address in zip(cafeNamesClean, cafeAddressesClean):
        location = geocode(address)
        if location is None:
            # No match (typically a "non-standard" address): there are no
            # coordinates to store, so skip the row rather than crash on
            # `None.latitude`.
            print("Could not geocode address, skipping:", address)
            continue
        fortable.append((name, address, location.latitude, location.longitude))

    ##connect to database
    try:
        sqliteConnection = sqlite3.connect('25july_database.db')
    except sqlite3.Error as error:
        # Nothing was opened, so there is nothing to close.
        print("Error while connecting to sqlite", error)
        return

    # From here on the connection exists; the finally clause guarantees it is
    # closed on every exit path.
    try:
        cursor = sqliteConnection.cursor()

        try:
            print("Database created and Successfully Connected to 25july_database")
            cursor.execute("select sqlite_version();")
            record = cursor.fetchall()
            print("SQLite Database Version is: ", record)
        except sqlite3.Error as error:
            print("Error while connecting to sqlite", error)
            return

        #create table
        sqlite_create_table_query = ''' CREATE TABLE IF NOT EXISTS scraper (
                                        name TEXT NOT NULL,
                                        address TEXT NOT NULL,
                                        latitude FLOAT NOT NULL,
                                        longitude FLOAT NOT NULL
                                        );'''
        try:
            print("Successfully Connected to SQLite")
            cursor.execute(sqlite_create_table_query)
            sqliteConnection.commit()
            print("SQLite table created")
        except sqlite3.Error as error:
            print("Error while creating a sqlite table", error)
            return

        ##enter data into table
        sqlite_insert_name_param = """INSERT INTO scraper VALUES (?,?,?,?);"""
        try:
            # One batch insert and one commit; rowcount is then the total
            # number of rows inserted by the batch.
            cursor.executemany(sqlite_insert_name_param, fortable)
            sqliteConnection.commit()
            print("Total", cursor.rowcount, "Records inserted successfully into table")
            cursor.close()
        except sqlite3.Error as error:
            print("Failed to insert data into sqlite table", error)

    finally:
        sqliteConnection.close()
        print("The SQLite connection is closed")


if __name__ == "__main__":
    scrapecafes('melbourne', 'thornbury')

运行脚本后：

Prior| 637 High Street, Thornbury|-37.76159772|144.99994556
Rat the Cafe| 72 Wales Street, Thornbury|-37.7618172|145.0091904
Ampersand Coffee and Food| 863 High Street, Thornbury|-37.754689125|145.0010879
Umberto Espresso Bar| 822 High Street, Thornbury|-37.7532839|145.0016297
Brother Alec| 719 High Street, Thornbury|-37.7590570333333|145.0003715
Short Round| 731 High Street, Thornbury|-37.758653675|145.000430475
Jerry Joy| 128  Mansfield Street, Thornbury|-37.7573008|145.0096578
The Old Milk Bar| 144 Dundas Street, Thornbury|-37.7544244|145.020563
Little Henri| 848  High Street, Thornbury|51.6087678|-2.5260139
Northern Soul| 843 High Street, Thornbury|-37.7552406555556|145.000992355556

【讨论】：

非常感谢！感觉你解答了我很多疑问，十分感激。还有一个问题：我在排查自己的代码时，已经把列表这部分写到了这一步：for item in cafeAddressesClean: location = (locator.geocode(item)) locList.append(item)，而你写的是 location.append(locator.geocode(item))，你的代码看起来更简洁。我的写法是错的，还是只是写得比较乱？