处理地理数据,GeoPandas是Python生态的瑞士军刀。它把Pandas和Shapely结合起来,让空间数据处理变得简单。
GeoPandas是什么?
一句话:地理数据版的Pandas。
核心能力:
安装
核心数据结构
GeoDataFrame
import geopandas as gpd# 创建GeoDataFramegdf = gpd.GeoDataFrame({ 'name': ['公寓A', '公寓B', '公寓C'], 'price': [1500, 2000, 1200], 'geometry': [ gpd.points_from_xy([113.65], [34.76])[0], gpd.points_from_xy([113.70], [34.78])[0], gpd.points_from_xy([113.62], [34.75])[0] ]})print(gdf)
GeoSeries
# 几何列 geometries = gpd.points_from_xy([113.65, 113.70], [34.76, 34.78])gs = gpd.GeoSeries(geometries)
读写数据
读取Shapefile
gdf = gpd.read_file('apartments.shp')print(gdf.head())print(gdf.crs)# 坐标系
读取GeoJSON
gdf = gpd.read_file('apartments.geojson')
读取PostGIS
from sqlalchemy import create_engineengine = create_engine('postgresql://user:pass@localhost:5432/gisdb')gdf = gpd.read_postgis( "SELECT * FROM apartments WHERE district = '金水区'", engine, geom_col='geom' )
写入文件
# 写入Shapefilegdf.to_file('output.shp', encoding='utf-8')# 写入GeoJSONgdf.to_file('output.geojson', driver='GeoJSON')# 写入PostGISgdf.to_postgis('apartments', engine, if_exists='append')
空间查询
范围查询
from shapely.geometry import Point# 创建中心点center = Point(113.65, 34.76)# 1公里缓冲区buffer = center.buffer(0.01)# 约1公里# 空间查询within = gdf[gdf.geometry.within(buffer)]print(f”范围内有 {len(within)} 个点”)
最近邻查询
def find_nearest(gdf, point, n=5): ”””找到最近的n个点””” distances = gdf.geometry.distance(point) nearest_idx = distances.nsmallest(n).index return gdf.loc[nearest_idx] point = Point(113.65, 34.76)nearest = find_nearest(gdf, point, 5)
空间分析
缓冲区分析
# 创建缓冲区gdf['buffer_1km'] = gdf.geometry.buffer(0.01)# 约1公里# 可视化gdf['buffer_1km'].plot(alpha=0.3)
叠加分析
# 读取两个图层apartments = gpd.read_file('apartments.shp')districts = gpd.read_file('districts.shp')# 空间连接:公寓属于哪个区joined = gpd.sjoin(apartments, districts, how='left', op='within')
面积计算
# 转换为投影坐标(单位:米)gdf_3857 = gdf.to_crs(epsg=3857)# 计算面积gdf['area_sqm'] = gdf_3857.geometry.areagdf['area_sqkm'] = gdf['area_sqm'] / 1e6
坐标系转换
# 查看当前坐标系print(gdf.crs)# 转换坐标系gdf_3857 = gdf.to_crs(epsg=3857)# 转Web墨卡托gdf_4326 = gdf_3857.to_crs(epsg=4326)# 转回WGS84
可视化
基础绘图
import matplotlib.pyplot as plt# 绘制点图gdf.plot(figsize=(10, 10), color='red', markersize=10)plt.title('人才公寓分布')plt.show()
分类绘图
# 按区域分类着色gdf.plot(column='district', figsize=(10, 10), legend=True)plt.title('按区域分类')plt.show()
数值着色
# 按价格着色gdf.plot(column='price', figsize=(10, 10), legend=True, cmap='YlOrRd')plt.title('价格分布')plt.show()
实战:人才公寓分析
import geopandas as gpdfrom shapely.geometry import Point# 1. 读取数据gdf = gpd.read_file('apartments.shp')# 2. 坐标系转换gdf = gdf.to_crs(epsg=3857)# 3. 创建缓冲区gdf['buffer_1km'] = gdf.geometry.buffer(1000)# 1公里# 4. 统计每个区的公寓数量district_count = gdf.groupby('district').size()print(”各区公寓数量:”)print(district_count)# 5. 找到最近的公寓center = Point(113.65, 34.76)center_3857 = gpd.GeoSeries([center], crs=4326).to_crs(epsg=3857).iloc[0]gdf['distance'] = gdf.geometry.distance(center_3857)nearest = gdf.nsmallest(5, 'distance')print(”\n最近的5个公寓:”)print(nearest[['name', 'district', 'distance']])
性能优化
1. 空间索引
# 创建空间索引gdf.sindex# 使用空间索引查询from shapely.geometry import boxbbox = box(113.60, 34.70, 113.70, 34.80)possible_matches_idx = list(gdf.sindex.intersection(bbox.bounds))possible_matches = gdf.iloc[possible_matches_idx]
2. 分块处理
def process_large_file(input_path, output_path, chunk_size=10000): ”””分块处理大文件””” import fiona with fiona.open(input_path) as src: for i, chunk in enumerate(gpd.read_file(input_path, chunksize=chunk_size)):# 处理每个块 processed = process_chunk(chunk)# 写入结果 if i == 0: processed.to_file(output_path) else: processed.to_file(output_path, mode='a')
常见问题
Q1:读取很慢怎么办?
# 只读取需要的列gdf = gpd.read_file('data.shp', columns=['name', 'geometry'])# 只读取指定区域gdf = gpd.read_file('data.shp', bbox=(113.6, 34.7, 113.7, 34.8))
Q2:坐标系不对怎么办?
# 检查坐标系print(gdf.crs)# 设置坐标系(如果缺失)gdf = gdf.set_crs(epsg=4326)# 转换坐标系gdf = gdf.to_crs(epsg=3857)
总结
GeoPandas的核心:
- 数据结构:GeoDataFrame = DataFrame + Geometry
- 读写
- 分析
- 可视化
记住:GeoPandas是Python GIS的基石,必须掌握。
作者:坐标系6297 | 用地图看懂世界,用数据讲清故事