MongoDB 去重:使用 upsert 避免插入重复数据

import pymongo

# Connect with the driver's default connection settings and pick the
# `test` collection of the `t` database.
client = pymongo.MongoClient()
collection = client.t.test

# Plain insert (performs no de-duplication):
# collection.insert_one({'title': 'python', 'name': 'deng', 'age': 23})

data = {'title': 'go', 'name': 'wang', 'age': 45, 'url': 1}

# Upsert keyed on 'url'. `Collection.update()` was removed in PyMongo 4;
# `update_one(..., upsert=True)` is the equivalent single-document call.
collection.update_one({'url': 1}, {'$set': data}, upsert=True)

# In the example above, if a document with the same url already exists, the
# url stays as it is and the other fields are overwritten with the values in
# `data` wherever they differ; if no such document exists, one is inserted.

 

爬虫案例:

# Upsert keyed on (url, cover_url): update the matching document, or insert
# it when none matches. `update_one` replaces the legacy `Collection.update()`,
# which was removed in PyMongo 4.
collection.update_one(
    {'url': data['url'], 'cover_url': data['cover_url']},
    {'$set': data},
    upsert=True,
)
# coding=utf8
"""
Fetch one page of the Douban movie chart and upsert each movie into MongoDB.

Each movie is stored in the `douban` collection of the `dou` database and is
keyed on (url, cover_url), so running the script again updates the existing
documents instead of inserting duplicates.

author:dengjiyun
"""
import pymongo
import requests

url = 'https://movie.douban.com/j/chart/top_list'

# Query-string parameters sent with the chart request.
params = {
    'type': '11',
    'interval_id': '100:90',
    'action': '',
    'start': '60',
    'limit': '20'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
}

# Keys copied from every chart entry into the stored document.
FIELDS = (
    'vote_count',    # comments (per the original author's note)
    'score',         # score
    'title',         # movie title
    'url',           # detail-page url
    'cover_url',     # cover image
    'rank',          # ranking
    'id',            # movie id
    'release_date',  # release date
)

client = pymongo.MongoClient()
collection = client.dou.douban

try:
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, params=params, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    res = response.json()

    for item in res:
        # A missing key raises KeyError, exactly as direct indexing would.
        data = {field: item[field] for field in FIELDS}

        print(item)
        # Do not insert duplicates: upsert on (url, cover_url).
        # `Collection.update()` was removed in PyMongo 4; `update_one` with
        # `upsert=True` is the equivalent single-document call.
        collection.update_one(
            {'url': data['url'], 'cover_url': data['cover_url']},
            {'$set': data},
            upsert=True,
        )
finally:
    # Release the connection even when the request or a write fails.
    client.close()

 

相关文章:

  • 2022-01-20
  • 2021-06-02
  • 2021-11-06
  • 2021-06-12
  • 2021-12-03
  • 2021-07-18
  • 2022-12-23
  • 2021-04-26
猜你喜欢
  • 2021-12-12
  • 2022-12-23
  • 2021-11-02
  • 2021-07-30
  • 2022-12-23
  • 2021-05-11
相关资源
相似解决方案