# Scrape the first page of the Douban Top 250 movie list and save the
# extracted fields (name, years, place, score, num) to a CSV file.
#
# Steps:
#   1. Fetch the page source with requests.
#   2. Extract the wanted fields with a compiled regular expression (re).
#   3. Write one CSV row per matched movie.
import csv
import re

import requests

url = "https://movie.douban.com/top250?start=0&filter="
# A browser-like User-Agent is sent with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/116.0.5845.97 Safari/537.36 "
        "Core/1.116.554.400 QQBrowser/19.5.6663.400"
    )
}

# The timeout prevents the script from hanging forever on a stalled
# connection; raise_for_status() fails loudly on an HTTP error instead of
# silently producing an empty CSV.
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()
page_content = resp.text

# Parse the data.  The pattern is split across adjacent raw-string literals
# purely for line length; Python concatenates them into a single pattern.
# re.S lets "." match newlines so one match can span several lines of HTML.
# NOTE(review): confirm the " / " separators match the live page markup
# (the page may use HTML entities such as &nbsp; around the slashes).
obj = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?\.\.\.<br>(?P<years>.*?)'
    r' / (?P<place>.*?) / .*?<span class="rating_num" property="v:average">'
    r'(?P<score>.*?)</span>.*?<span property="v:best" content="10.0"></span>.*?<span>'
    r'(?P<num>.*?)</span>',
    re.S,
)

# newline="" is required by the csv module so that it controls line endings
# itself (otherwise blank rows appear on Windows).  The with-block guarantees
# the file is closed even if writing a row raises.
with open("date.csv", mode="w", encoding="utf-8", newline="") as f:
    csvwriter = csv.writer(f)
    # Start matching: iterate lazily over every match in the page.
    for it in obj.finditer(page_content):
        # groupdict() maps each named group (name, years, place, score, num)
        # to the text it captured.
        dic = it.groupdict()
        # Strip surrounding whitespace from the years field.
        dic['years'] = dic['years'].strip()
        # Write the captured values, in group order, as one CSV row.
        csvwriter.writerow(dic.values())

print("over!")