
基于百度地图API的全国公园信息爬取系统,支持获取城市列表、公园基本信息及详细信息。
本项目通过调用百度地图Place API,系统化地爬取全国27个省级行政区的公园数据,包括城市列表、公园基本信息和公园详细信息。
项目使用MySQL存储数据,包含三张主要表:
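city 表:存储拥有公园的城市信息

- id: 主键(注:文末 sql 建表语句中 city 表以 name 列为主键,未定义 id 列)
- name: 城市名称
- num: 公园数量

park 表:存储公园基本信息

- id: 主键
- city: 所属城市
- park: 公园名称
- location_lat: 纬度
- location_lng: 经度
- address: 地址
- uid: 百度地图唯一标识

park_detail 表:存储公园详细信息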
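- id: 主键
- park: 公园名称
- location_lat/location_lng: 经纬度
- address: 地址
- telephone: 电话
- tag: 标签
- overall_rating: 综合评分
- comment_num: 评论数量
- image_num: 图片数量
- shop_hours: 营业时间
- description: 描述

| 脚本 | 输出 |
| --- | --- |
| get_city.py | citys_garden_num.txt |
| init_city_park.py | city 表 |
| get_park.py | park 表 |
| detail.py | park_detail 表 |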
```bash
pip install mysqlclient requests
```

修改各Python文件中的数据库连接信息(当前配置在代码中):
```python
conn = MySQLdb.connect(
    host='换成你自己的数据库地址',
    user='park',
    password='Park_2025',
    port=3306,
    db='park'
)
```

步骤1:获取城市列表(二选一)
```bash
# 仅保存到文本文件
python get_city.py
# 或直接保存到数据库
python init_city_park.py
```

步骤2:爬取公园基本信息

```bash
python get_park.py
```

注意:此步骤会遍历所有城市,分页获取每个城市的公园数据
步骤3:爬取公园详细信息
```bash
python detail.py
```

注意:此步骤会根据已获取的公园UID,逐个查询详细信息
覆盖全国27个省级行政区:
江苏省、浙江省、广东省、福建省、山东省、河南省、河北省、四川省、辽宁省、云南省、湖南省、湖北省、江西省、安徽省、山西省、广西壮族自治区、陕西省、黑龙江省、内蒙古自治区、贵州省、吉林省、甘肃省、新疆维吾尔自治区、海南省、宁夏回族自治区、青海省、西藏自治区
本项目使用百度地图Place API v2:
- http://api.map.baidu.com/place/v2/search
- http://api.map.baidu.com/place/v2/detail

需要替换代码中的AK(API Key)为自己的密钥
```text
.
├── get_city.py              # 获取城市列表(保存到txt)
├── init_city_park.py        # 获取城市列表(保存到数据库)
├── get_park.py              # 爬取公园基本信息
├── detail.py                # 爬取公园详细信息
├── sql/                     # 数据库SQL文件
│   ├── city.sql             # 城市数据
│   ├── park.sql             # 公园基本信息
│   └── park_detail.sql      # 公园详细信息
├── citys_garden_num.txt     # 城市公园数量文本文件
└── README.md                # 项目说明文档
```

```python
# coding=utf-8
"""Fetch the detail record of every park UID in the park table into park_detail."""
import json
import time

import MySQLdb
import requests

DETAIL_URL = "http://api.map.baidu.com/place/v2/detail"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16'
}
REQUEST_DELAY = 0.5  # seconds to wait after each API call so requests are not too frequent
COMMIT_EVERY = 10    # commit once per this many successful inserts

INSERT_SQL = """INSERT INTO park_detail (park, location_lat, location_lng, address, street_id, uid,
                telephone, detail, tag, detail_url, type, overall_rating, image_num, comment_num,
                keyword, shop_hours, alias, scope_type, scope_grade, description)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""


def get_json(uid):
    """Query the Place API detail endpoint for one POI.

    Returns the decoded JSON object. When the request or the JSON decoding
    fails, returns ``{'status': -1, 'message': <error text>}`` so the caller
    can handle it the same way as an API-level failure.
    """
    params = {
        'uid': uid,
        'output': 'json',  # response format
        'scope': '2',      # 1 or empty returns basic info only; 2 returns detailed POI info
        'ak': '换成你自己的ak'
    }
    try:
        res = requests.get(DETAIL_URL, params=params, headers=HEADERS, timeout=30)
        return json.loads(res.text)
    except (requests.RequestException, ValueError) as e:
        print(f"请求UID {uid} 时发生错误: {e}")
        return {'status': -1, 'message': str(e)}


def build_row(uid, info):
    """Flatten one API ``result`` object into a tuple ordered like INSERT_SQL's columns."""
    location = info.get('location') or {}
    detail_info = info.get('detail_info') or {}

    # Review keywords arrive as a list of objects; store them as one '/'-joined string.
    key_words = None
    keyword_items = detail_info.get('di_review_keyword')
    if isinstance(keyword_items, list) and keyword_items:
        key_words = '/'.join(
            str(item.get('keyword') or '') for item in keyword_items if isinstance(item, dict)
        )

    return (
        info.get('name'),
        location.get('lat'),
        location.get('lng'),
        info.get('address'),
        info.get('street_id'),
        uid,
        info.get('telephone'),
        info.get('detail'),
        detail_info.get('tag'),
        detail_info.get('detail_url'),
        detail_info.get('type'),
        detail_info.get('overall_rating'),
        detail_info.get('image_num'),
        detail_info.get('comment_num'),
        key_words,
        detail_info.get('shop_hours'),
        detail_info.get('alias'),
        detail_info.get('scope_type'),
        detail_info.get('scope_grade'),
        detail_info.get('description'),
    )


def main():
    """Walk every park UID, fetch its details and insert the ones not stored yet."""
    # NOTE(review): credentials are hard-coded; consider loading them from the environment.
    conn = MySQLdb.connect(host='****', user='park', password='Park_2025', port=3306, db='park')
    try:
        cur = conn.cursor()

        # Load the UID of every park
        cur.execute("SELECT uid FROM park WHERE uid IS NOT NULL AND uid != ''")
        results = cur.fetchall()  # tuple of 1-tuples, e.g. (('uid1',), ('uid2',))
        print(f"总共查询到 {len(results)} 个公园UID")

        success_count = 0
        error_count = 0

        for i, row in enumerate(results, 1):
            uid = row[0]
            print(f"正在处理第 {i}/{len(results)} 个公园: {uid}")

            # Check for an existing record BEFORE calling the API, so already
            # stored parks cost neither an API request nor a delay.
            cur.execute("SELECT id FROM park_detail WHERE uid = %s", (uid,))
            if cur.fetchone():
                print(f" UID {uid} 已存在,跳过")
                continue

            decodejson = get_json(uid)

            if decodejson.get('status') == 0 and 'result' in decodejson:
                try:
                    values = build_row(uid, decodejson['result'])
                    cur.execute(INSERT_SQL, values)
                except Exception as e:
                    # Per-record boundary: log and move on so one bad record
                    # cannot abort the whole crawl.
                    error_count += 1
                    print(f" 处理UID {uid} 时发生错误: {e}")
                else:
                    success_count += 1
                    print(f" 成功保存公园: {values[0]}")
                    if success_count % COMMIT_EVERY == 0:
                        conn.commit()
                        print(f" 已提交 {success_count} 条记录")
            else:
                error_count += 1
                print(f" UID {uid} API请求失败: {decodejson.get('message', '未知错误')}")

            # Throttle after every API call, including the failed ones
            time.sleep(REQUEST_DELAY)

        # Final commit for the last partial batch
        conn.commit()
        cur.close()
    finally:
        conn.close()

    print("\n处理完成!")
    print(f"成功保存: {success_count} 条记录")
    print(f"失败: {error_count} 条记录")


if __name__ == "__main__":
    main()
```

```python
# coding=utf-8
"""Collect every city that has parks and append "<city>\\t<count>" lines to a text file."""
import json
import time

import requests

SEARCH_URL = "http://api.map.baidu.com/place/v2/search"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16'
}
OUTPUT_FILE = 'citys_garden_num.txt'

province_list = ['江苏省', '浙江省', '广东省', '福建省', '山东省', '河南省', '河北省', '四川省', '辽宁省',
                 '云南省', '湖南省', '湖北省', '江西省', '安徽省', '山西省', '广西壮族自治区', '陕西省',
                 '黑龙江省', '内蒙古自治区', '贵州省', '吉林省', '甘肃省', '新疆维吾尔自治区', '海南省',
                 '宁夏回族自治区', '青海省', '西藏自治区']


def get_json(region):
    """Search the Place API for parks in ``region`` and return the decoded JSON.

    When the request or the JSON decoding fails, returns
    ``{'status': -1, 'message': <error text>}`` instead of raising.
    """
    params = {
        'query': '公园',    # search keyword
        'region': region,   # administrative region to search in
        'output': 'json',   # response format
        'scope': '1',       # 1 or empty returns basic info only; 2 returns detailed POI info
        'page_size': 20,    # POIs per call; the API default is 10 and the maximum is 20
        'page_num': 0,      # zero-based page index
        'ak': 'QG8oPpb**********换成你的ak'
    }
    try:
        res = requests.get(SEARCH_URL, params=params, headers=HEADERS, timeout=30)
        return json.loads(res.text)
    except (requests.RequestException, ValueError) as e:
        print(f"请求 {region} 时发生错误: {e}")
        return {'status': -1, 'message': str(e)}


def main():
    """Query each province once and append its per-city park counts to OUTPUT_FILE."""
    # Open the file once. newline='' writes the explicit '\r\n' unchanged on
    # every platform (text mode would turn it into '\r\r\n' on Windows).
    # Append mode is kept, so re-running the script adds to an existing file.
    with open(OUTPUT_FILE, 'a', encoding='utf-8', newline='') as f:
        for eachprovince in province_list:
            decodejson = get_json(eachprovince)
            if decodejson.get('status') != 0:
                print(f" {eachprovince} API请求失败: {decodejson.get('message', '未知错误')}")
            for eachcity in decodejson.get('results') or []:
                try:
                    city = eachcity['name']
                    num = eachcity['num']
                except (KeyError, TypeError) as e:
                    print(f" 跳过无效城市数据: {eachcity}, 错误: {e}")
                    continue
                f.write('\t'.join([city, str(num)]) + '\r\n')
            # Throttle so requests are not too frequent
            time.sleep(1)


if __name__ == "__main__":
    main()
```

```python
# coding=utf-8
"""Fetch the parks of every city in the city table and store them in the park table."""
import json
import time

import MySQLdb
import requests

SEARCH_URL = "http://api.map.baidu.com/place/v2/search"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16'
}
PAGE_SIZE = 20       # POIs per call; the API default is 10 and the maximum is 20
REQUEST_DELAY = 0.5  # seconds to wait between page requests

INSERT_SQL = """INSERT INTO park (city, park, location_lat, location_lng, address, uid)
                VALUES (%s, %s, %s, %s, %s, %s)"""


def get_json(region, page_num):
    """Search the Place API for one page of parks in ``region``.

    Returns the decoded JSON object. When the request or the JSON decoding
    fails, returns ``{'status': -1, 'message': <error text>}`` instead of
    raising, so one network error does not abort the whole crawl.
    """
    params = {
        'query': '公园',         # search keyword
        'region': region,        # administrative region to search in
        'output': 'json',        # response format
        'scope': '1',            # 1 or empty returns basic info only; 2 returns detailed POI info
        'page_size': PAGE_SIZE,
        'page_num': page_num,    # zero-based page index
        'ak': 'QG8oP换成你自己的AK'
    }
    try:
        res = requests.get(SEARCH_URL, params=params, headers=HEADERS, timeout=30)
        return json.loads(res.text)
    except (requests.RequestException, ValueError) as e:
        print(f"请求 {region} 时发生错误: {e}")
        return {'status': -1, 'message': str(e)}


def main():
    """Rebuild the park table by paging through the search results of every city."""
    # NOTE(review): credentials are hard-coded; consider loading them from the environment.
    conn = MySQLdb.connect(host='换成你自己的数据库', user='park', password='Park_2025', port=3306, db='park')
    try:
        cur = conn.cursor()

        # Empty the park table so a re-run does not duplicate rows
        cur.execute("DELETE FROM park")
        print("已清空park表")

        # Load every city name, busiest cities first
        cur.execute("SELECT name FROM city order by num desc")
        city_list = [row[0] for row in cur.fetchall()]
        print(f"从数据库获取到 {len(city_list)} 个城市")

        # The table was just emptied, so the UIDs inserted during this run are
        # exactly the UIDs in the table; a set replaces one SELECT per row.
        seen_uids = set()

        for eachcity in city_list:
            print(f"正在处理城市: {eachcity}")
            city_park_count = 0
            page_num = 0

            while True:
                decodejson = get_json(eachcity, page_num)
                if decodejson.get('status') != 0:
                    # Report API failures instead of mistaking them for the last page
                    print(f" {eachcity} API请求失败: {decodejson.get('message', '未知错误')}")
                    break

                pois = decodejson.get('results') or []
                for eachone in pois:
                    if not isinstance(eachone, dict):
                        continue
                    park = eachone.get('name')
                    if park is None:
                        # park.park is NOT NULL; inserting a nameless record would raise
                        continue
                    location = eachone.get('location') or {}
                    uid = eachone.get('uid')

                    # Skip a UID that was already stored
                    if uid:
                        if uid in seen_uids:
                            continue
                        seen_uids.add(uid)

                    cur.execute(INSERT_SQL, (eachcity, park, location.get('lat'), location.get('lng'),
                                             eachone.get('address'), uid))
                    city_park_count += 1

                # A short (or empty) page means this was the last one
                if len(pois) < PAGE_SIZE:
                    break
                page_num += 1
                time.sleep(REQUEST_DELAY)

            print(f"{eachcity} 完成,获取到 {city_park_count} 个公园")
            conn.commit()  # commit once per city

        cur.close()
    finally:
        conn.close()

    print("所有城市公园数据获取完成!")


if __name__ == "__main__":
    main()
```

```python
# coding=utf-8
"""Collect every city that has parks and store the per-city park counts in the city table."""
import json
import time

import MySQLdb
import requests

SEARCH_URL = "http://api.map.baidu.com/place/v2/search"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16'
}

province_list = ['江苏省', '浙江省', '广东省', '福建省', '山东省', '河南省', '河北省', '四川省', '辽宁省',
                 '云南省', '湖南省', '湖北省', '江西省', '安徽省', '山西省', '广西壮族自治区', '陕西省',
                 '黑龙江省', '内蒙古自治区', '贵州省', '吉林省', '甘肃省', '新疆维吾尔自治区', '海南省',
                 '宁夏回族自治区', '青海省', '西藏自治区']


def get_json(region):
    """Search the Place API for parks in ``region`` and return the decoded JSON.

    When the request or the JSON decoding fails, returns
    ``{'status': -1, 'message': <error text>}`` instead of raising.
    """
    params = {
        'query': '公园',    # search keyword
        'region': region,   # administrative region to search in
        'output': 'json',   # response format
        'scope': '1',       # 1 or empty returns basic info only; 2 returns detailed POI info
        'page_size': 20,    # POIs per call; the API default is 10 and the maximum is 20
        'page_num': 0,      # zero-based page index
        'ak': '换成你自己的ak'
    }
    try:
        res = requests.get(SEARCH_URL, params=params, headers=HEADERS, timeout=30)
        return json.loads(res.text)
    except (requests.RequestException, ValueError) as e:
        print(f"请求 {region} 时发生错误: {e}")
        return {'status': -1, 'message': str(e)}


def main():
    """Rebuild the city table from one province-level search per province."""
    # NOTE(review): credentials are hard-coded; consider loading them from the environment.
    conn = MySQLdb.connect(host='换成你自己的数据地址', user='park', password='Park_2025', port=3306, db='park')
    try:
        cur = conn.cursor()

        # Empty the city table so a re-run does not duplicate rows
        cur.execute("DELETE FROM city")
        print("已清空city表")

        total_cities = 0

        for eachprovince in province_list:
            print(f"\n正在处理省份: {eachprovince}")
            decodejson = get_json(eachprovince)

            if decodejson.get('status') != 0:
                print(f" {eachprovince} API请求失败: {decodejson.get('message', '未知错误')}")
            elif not decodejson.get('results'):
                print(f" {eachprovince} 没有返回城市数据")
            else:
                province_cities = 0
                for eachcity in decodejson['results']:
                    try:
                        city = eachcity['name']
                        num = eachcity['num']
                        cur.execute("INSERT INTO city (name, num) VALUES (%s, %s)", (city, num))
                    except (KeyError, TypeError) as e:
                        print(f" 跳过无效城市数据: {eachcity}, 错误: {e}")
                    except MySQLdb.Error as e:
                        print(f" 插入城市数据失败: {eachcity}, 错误: {e}")
                    else:
                        province_cities += 1
                        total_cities += 1
                print(f" {eachprovince} 获取到 {province_cities} 个城市")

            # Throttle so requests are not too frequent
            time.sleep(1)

        # Commit everything in one transaction
        conn.commit()
        cur.close()
    finally:
        conn.close()

    print(f"\n城市数据获取完成!总共获取到 {total_cities} 个城市")


if __name__ == "__main__":
    main()
```

```sql
DROP TABLE IF EXISTS `city`;
CREATE TABLE `city` (
  `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
  `num` int NULL DEFAULT NULL,
  PRIMARY KEY (`name`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;

DROP TABLE IF EXISTS `park_detail`;
CREATE TABLE `park_detail` (
  `id` int NOT NULL AUTO_INCREMENT,
  `park` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
  -- double instead of float: single precision cannot hold six-decimal coordinates
  `location_lat` double NULL DEFAULT NULL,
  `location_lng` double NULL DEFAULT NULL,
  `address` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `street_id` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `telephone` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `detail` int NULL DEFAULT NULL,
  `uid` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `tag` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `type` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `detail_url` varchar(800) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `price` int NULL DEFAULT NULL,
  `overall_rating` float NULL DEFAULT NULL,
  `image_num` int NULL DEFAULT NULL,
  `comment_num` int NULL DEFAULT NULL,
  `shop_hours` varchar(800) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `alias` varchar(800) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `keyword` varchar(800) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `scope_type` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `scope_grade` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `description` varchar(9000) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `created_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`) USING BTREE,
  -- the detail crawler looks rows up by uid before every insert
  KEY `idx_park_detail_uid` (`uid`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 288 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;

DROP TABLE IF EXISTS `park`;
CREATE TABLE `park` (
  `id` int NOT NULL AUTO_INCREMENT,
  `city` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
  `park` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
  -- double instead of float: single precision cannot hold six-decimal coordinates
  `location_lat` double NULL DEFAULT NULL,
  `location_lng` double NULL DEFAULT NULL,
  `address` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `uid` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `created_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`) USING BTREE,
  -- rows are looked up by uid
  KEY `idx_park_uid` (`uid`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 1922 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
```

喜欢我的文章点点关注⭐
