名师讲堂|使用 Python 基于 LandScan 数据测算城市多中心指标(商玉萍版本)

coords = np.column_stack([hh_gdf.geometry.x, hh_gdf.geometry.y])

# 构建邻接关系

w_hh = DistanceBand.from_array(coords, threshold=dist_nb, silence_warnings=True)

# 获取连通分量（聚类）

components = w_hh.component_labels

hh_gdf = hh_gdf.copy()

hh_gdf['cluster'] = components

# 按 cluster 汇总

centers = hh_gdf.groupby('cluster').agg({

'pop': 'sum',

'geometry': lambda x: (np.mean(x.x), np.mean(x.y))

}).reset_index()

centers['n'] = hh_gdf.groupby('cluster').size().values

centers['x'] = centers['geometry'].apply(lambda p: p[0])

centers['y'] = centers['geometry'].apply(lambda p: p[1])

centers = centers.drop('geometry', axis=1)

return centers

# Group the HH cells in `hh` into candidate centers, using the 'dist_nb' setting.
centers = cluster_centers(hh, ANALYSIS_PARAMS['dist_nb'])

# Keep only valid centers: a cluster must meet both the population and the
# cell-count thresholds from ANALYSIS_PARAMS (the tutorial quotes
# population >= 100,000 and at least 3 grid cells).
centers = centers.loc[
    (centers['pop'] >= ANALYSIS_PARAMS['min_pop'])
    & (centers['n'] >= ANALYSIS_PARAMS['min_cells'])
]

print(centers)

4.6 计算 5 大指标

defcalc_paper_indicators(city_code, pop_file, city_all, lw=None, params=None):

五、批量串行计算（全国所有城市 × 所有年份）

5.1 封装核心计算函数

将以上步骤封装为函数，接收城市代码和人口 tif 文件路径，返回该城市该年份的 5 大指标：

"""计算单个城市单年份的 5 大指标"""

def calc_paper_indicators(city_code, pop_file, city_all, lw=None, params=None):
    """Compute the five indicators for one city and one population raster.

    Args:
        city_code: Value matched against ``city_all['city_code']``.
        pop_file: Path of the population raster. The first four-digit run in
            its base name is reported as the year.
        city_all: GeoDataFrame with ``city_code``, ``city_name`` and geometry.
        lw: Optional pre-built spatial weights. When ``None`` the weights are
            built from the point coordinates with ``build_spatial_weights``.
        params: Parameter dict providing ``dist_nb``, ``sig_level``,
            ``min_cells`` and ``min_pop``. Defaults to the module-level
            ``ANALYSIS_PARAMS``.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``year``,
        ``city_code``, ``city_name``, ``center``, ``pareto``, ``poly``,
        ``sub3``, ``sub5`` and ``total_pop``. ``None`` is returned when the
        city is not found, the raster yields no points, there are fewer HH
        cells than ``min_cells``, no cluster passes the thresholds, or any
        step raises (the error is printed, not re-raised).
    """
    if params is None:
        params = ANALYSIS_PARAMS

    # Best-effort by design: one failing city/year must not abort a batch run.
    try:
        # Look up the requested city.
        city = city_all[city_all['city_code'] == city_code]
        if len(city) == 0:
            return None

        city_geom = city.geometry.iloc[0]
        city_name = city['city_name'].iloc[0]

        # Convert the raster to points for this city.
        pts = raster_to_points(pop_file, city_geom)
        if len(pts) == 0:
            return None

        # Build the spatial weights only when the caller did not supply them.
        if lw is None:
            coords = np.column_stack([pts.geometry.x, pts.geometry.y])
            w = build_spatial_weights(coords, params['dist_nb'])
        else:
            w = lw

        # Local Moran's I and LISA classification.
        p_values, lag_val, types = local_moran_lisa(
            pts['pop'].values, w, params['sig_level']
        )
        pts['type'] = types

        # Keep the cells classified as HH.
        hh = pts[pts['type'] == 'HH'].copy()
        if len(hh) < params['min_cells']:
            return None

        # Cluster the HH cells and keep the clusters that pass both thresholds.
        centers = cluster_centers(hh, params['dist_nb'])
        centers = centers[(centers['n'] >= params['min_cells']) &
                          (centers['pop'] >= params['min_pop'])]
        if len(centers) == 0:
            return None

        # Indicators derived from the centers.
        center_n = len(centers)
        pareto = calc_pareto_index(centers)
        poly = calc_polycentric_index(centers)

        # The CBD is taken to be the most populous cell.
        cbd_idx = pts['pop'].idxmax()
        cbd_geom = pts.loc[cbd_idx, 'geometry']
        sub_results = calc_decentralization(pts, cbd_geom)

        # Extract the year from the file name.
        year_match = re.search(r'(\d{4})', os.path.basename(pop_file))
        year = int(year_match.group(1)) if year_match else None

        return pd.DataFrame({
            'year': [year],
            'city_code': [city_code],
            'city_name': [city_name],
            'center': [center_n],
            'pareto': [round(pareto, 4)],
            'poly': [round(poly, 4)],
            'sub3': [round(sub_results['sub3'], 4)],
            'sub5': [round(sub_results['sub5'], 4)],
            'total_pop': [pts['pop'].sum()],
        })
    except Exception as e:
        print(f"计算失败: {city_code}, {pop_file}, 错误: {e}")
        return None

# Try the function out on a single city (code 110000).
result = calc_paper_indicators(110000, pop_file, city_demo)

print(result)

5.2 加载全部城市与年份文件

# Read the city layer and reproject it to the project CRS in one step.
city_all = gpd.read_file(PATH['city_shp']).to_crs(MYCRS)

# Expose the code and name columns under the names used by the functions below.
city_all = city_all.assign(
    city_code=city_all['市代码'],
    city_name=city_all['市'],
)

# Index every yearly .tif file, in sorted order.
pop_files = sorted(Path(PATH['pop_tif']).glob("*.tif"))

print(f"找到 {len(pop_files)} 个年份文件")

5.3 串行计算所有城市

defcompute_lw(city_code, city_all, sample_pop_file, output_dir="lwres"):

六、改进算法：预计算空间权重矩阵

6.1 为什么可以改进？

在上面的批量计算中，对同一个城市，每个年份都重新计算了一次空间权重矩阵。但实际上，空间权重矩阵只取决于城市边界的形状——它与年份无关，只要城市行政边界不变（本项目使用 2021 年固定边界），同一城市所有年份的权重矩阵完全相同。

因此，可以先把所有城市的权重矩阵计算并保存好，然后在计算各年数据时直接读取，显著减少重复计算。

对于一个有 $N$ 个城市、$T$ 个年份的数据集：

方法	权重矩阵计算次数
原始方法	$N \times T$
改进方法	$N$（预计算一次）

当 $T = 25$（2000～2024 年）时，改进方法可将权重矩阵的计算量缩减为原来的 $1/25$。

6.2 预计算并保存所有城市的权重矩阵

import pickle

"""预计算单个城市的空间权重矩阵"""

def compute_lw(city_code, city_all, sample_pop_file, output_dir="lwres"):
    """Precompute and save the spatial weights matrix of one city.

    Args:
        city_code: Value matched against ``city_all['city_code']``.
        city_all: GeoDataFrame with a ``city_code`` column and geometry.
        sample_pop_file: Population raster used only to determine the grid
            point locations inside the city.
        output_dir: Directory that receives ``<city_code>.pkl``. It is
            created when missing.

    Returns:
        The weights object that was pickled, or ``None`` when the city is not
        found, the raster yields no points, or any step raises (the error is
        printed, not re-raised).
    """
    # Best-effort by design: one failing city must not abort the whole loop.
    try:
        city = city_all[city_all['city_code'] == city_code]
        if len(city) == 0:
            return None

        city_geom = city.geometry.iloc[0]

        # Use the sample population file to determine the grid point locations.
        pts = raster_to_points(sample_pop_file, city_geom)
        if len(pts) == 0:
            return None

        coords = np.column_stack([pts.geometry.x, pts.geometry.y])
        w = build_spatial_weights(coords, ANALYSIS_PARAMS['dist_nb'])

        # Save the weights matrix.
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{city_code}.pkl")
        with open(output_path, 'wb') as f:
            pickle.dump(w, f)

        return w
    except Exception as e:
        print(f"权重矩阵计算失败: {city_code}, 错误: {e}")
        return None

# Compute the weights matrix of every city serially.
# (Per the tutorial: joblib.Parallel cannot serialize functions defined in
# __main__ under reticulate, so the Rmd uses a serial loop; use the standalone
# batch_calculate.py script when parallelism is needed.)
sample_pop_file = os.path.join(PATH['pop_tif'], "2020.tif")

for code in city_demo['city_code'].unique():
    compute_lw(code, city_demo, sample_pop_file)

print("======== 所有城市权重矩阵计算完成，保存在 lwres/ ========")

6.3 使用预计算权重矩阵的改进版计算函数

改进版函数从外部接收 lw 参数，不在函数内部重新计算权重矩阵：

def load_lw(city_code, lw_dir="lwres"):
    """Load the precomputed spatial weights matrix of one city.

    Args:
        city_code: City code; the file read is ``<lw_dir>/<city_code>.pkl``.
        lw_dir: Directory holding the pickled weights.

    Returns:
        The unpickled object, or ``None`` when the file does not exist.
    """
    lw_path = os.path.join(lw_dir, f"{city_code}.pkl")
    if not os.path.exists(lw_path):
        return None

    # NOTE: unpickling can execute arbitrary code; only load files written by
    # this workflow, never pickles from an untrusted source.
    with open(lw_path, 'rb') as f:
        return pickle.load(f)

def calc_paper_indicators2(current_city, pop_file, lw, params=None):
    """Compute the five indicators using externally supplied spatial weights.

    Args:
        current_city: GeoDataFrame whose first row provides the geometry,
            ``city_code`` and ``city_name`` of the city.
        pop_file: Path of the population raster. The first four-digit run in
            its base name is reported as the year.
        lw: Pre-built spatial weights, passed straight to
            ``local_moran_lisa``; they are not recomputed here.
        params: Parameter dict providing ``dist_nb``, ``sig_level``,
            ``min_cells`` and ``min_pop``. Defaults to the module-level
            ``ANALYSIS_PARAMS``.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``year``,
        ``city_code``, ``city_name``, ``center``, ``pareto``, ``poly``,
        ``sub3``, ``sub5`` and ``total_pop``. ``None`` is returned when the
        raster yields no points, there are fewer HH cells than ``min_cells``,
        no cluster passes the thresholds, or any step raises (the error is
        printed, not re-raised).
    """
    if params is None:
        params = ANALYSIS_PARAMS

    # Defined up front so the error message below can always reference it.
    city_code = None

    # Best-effort by design: one failing city/year must not abort a batch run.
    try:
        city_geom = current_city.geometry.iloc[0]
        city_code = current_city['city_code'].iloc[0]
        city_name = current_city['city_name'].iloc[0]

        # Convert the raster to points for this city.
        pts = raster_to_points(pop_file, city_geom)
        if len(pts) == 0:
            return None

        # Use the weights passed in by the caller directly.
        p_values, lag_val, types = local_moran_lisa(
            pts['pop'].values, lw, params['sig_level']
        )
        pts['type'] = types

        # Keep the cells classified as HH.
        hh = pts[pts['type'] == 'HH'].copy()
        if len(hh) < params['min_cells']:
            return None

        # Cluster the HH cells and keep the clusters that pass both thresholds.
        centers = cluster_centers(hh, params['dist_nb'])
        centers = centers[(centers['n'] >= params['min_cells']) &
                          (centers['pop'] >= params['min_pop'])]
        if len(centers) == 0:
            return None

        # Indicators derived from the centers.
        center_n = len(centers)
        pareto = calc_pareto_index(centers)
        poly = calc_polycentric_index(centers)

        # The CBD is taken to be the most populous cell.
        cbd_idx = pts['pop'].idxmax()
        cbd_geom = pts.loc[cbd_idx, 'geometry']
        sub_results = calc_decentralization(pts, cbd_geom)

        # Extract the year from the file name.
        year_match = re.search(r'(\d{4})', os.path.basename(pop_file))
        year = int(year_match.group(1)) if year_match else None

        return pd.DataFrame({
            'year': [year],
            'city_code': [city_code],
            'city_name': [city_name],
            'center': [center_n],
            'pareto': [round(pareto, 4)],
            'poly': [round(poly, 4)],
            'sub3': [round(sub_results['sub3'], 4)],
            'sub5': [round(sub_results['sub5'], 4)],
            'total_pop': [pts['pop'].sum()],
        })
    except Exception as e:
        # Same message format as calc_paper_indicators.
        print(f"计算失败: {city_code}, {pop_file}, 错误: {e}")
        return None

# Try out the improved function on one city (code 110000), using the
# precomputed weights that load_lw reads back from disk.
city_demo_single = city_demo[city_demo['city_code'] == 110000]

lw_demo = load_lw(110000)

result = calc_paper_indicators2(city_demo_single, pop_file, lw=lw_demo)

print(result)

6.4 使用改进算法批量计算全部数据

defmerge_results(input_dir="resb", output_file="results.dta"):

6.5 合并所有结果并保存

"""合并所有结果文件"""

def merge_results(input_dir="resb", output_file="results.dta"):
    """Merge every result CSV in a directory and save the combined table.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.csv`` files.
        output_file: Target path for the Stata ``.dta`` file. When
            ``pyreadstat`` is not installed, the table is written as CSV to
            the same path with the suffix replaced by ``.csv``.

    Returns:
        The concatenated ``pandas.DataFrame``, or ``None`` when no CSV file
        is found.
    """
    # Sorted so that the row order of the merged table is reproducible.
    all_files = sorted(Path(input_dir).glob("*.csv"))
    if not all_files:
        print("未找到结果文件")
        return None

    result = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)

    # Save as .dta (needs pyreadstat); fall back to CSV when it is unavailable.
    try:
        import pyreadstat

        pyreadstat.write_dta(result, output_file)
        print(f"结果已保存为: {output_file}")
    except ImportError:
        # Only the file suffix is swapped, so a directory name that happens
        # to contain ".dta" is left intact.
        result.to_csv(str(Path(output_file).with_suffix('.csv')), index=False)
        print("结果已保存为 CSV 格式")

    return result

# Merge the CSV results found in "resb" and save them under the final dataset name.
final_result = merge_results("resb", "2000～2024年各城市多中心指标（商玉萍版本）.dta")

print(final_result.head())

最终结果数据集包含如下变量：

变量名	说明
`year`	年份（2000～2024）
`city_code`	行政区划代码（2021 年版）
`city_name`	城市名称
`center`	有效城市中心数量
`pareto`	帕累托指数，越小越均衡
`poly`	含距离的多中心指数，越小越均衡
`sub3`	CBD 3 千米以外的人口占比
`sub5`	CBD 5 千米以外的人口占比
`total_pop`	城市总人口

通常，pareto、poly 值越小，代表各中心的人口分布越均衡（均等）；sub3、sub5 越大，代表城市人口越去中心化。

七、稳健性检验：放宽中心人口门槛至 1 万人

商玉萍（2022）论文中提供了一项稳健性检验：将"总人口在 10 万人以上"的条件改为"总人口在 1 万人以上"，重新确定每个城市的中心数量（2center）作为稳健性指标。

这一操作的实现方式非常简单，只需将 ANALYSIS_PARAMS['min_pop'] 改为 10000，其余代码完全不变：