用 Python 比较方便:
- import requests
- import time
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin
- from pypinyin import pinyin, lazy_pinyin, Style
-
def get_lower_pinyin(string):
    """Return the pinyin of ``string`` joined into one unbroken string.

    The text is converted with ``lazy_pinyin`` using ``Style.NORMAL`` and the
    resulting pieces are concatenated without any separator.
    """
    return ''.join(lazy_pinyin(string, style=Style.NORMAL))
-
def get_link_text(url, timeout=10):
    """Crawl ``url`` and, recursively, the pages its table links point to.

    For every ``<a>`` found inside an attribute-less ``<td>`` whose text is
    not purely digits, the link text is printed and a ``text,pinyin`` line is
    appended to the module-level ``file_path``. If the anchor has a non-empty
    ``href``, the linked page is crawled the same way after a 0.5 s pause.

    Pages that do not answer with HTTP 200 are skipped silently.

    Args:
        url: URL of the page to fetch; relative ``href`` values are resolved
            against it.
        timeout: Seconds to wait for the server before giving up on a request.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return

    response.encoding = 'UTF8'  # Force UTF-8 decoding of the response body
    soup = BeautifulSoup(response.text, 'html.parser')

    for td in soup.find_all('td'):
        # Only cells that carry no attributes at all and contain a link.
        if td.attrs or not td.find('a'):
            continue

        for link in td.find_all('a'):
            link_text = link.get_text()
            # Links whose text is purely numeric are ignored.
            if link_text.isdigit():
                continue

            print(link_text)
            # Explicit encoding: the locale default may be unable to encode
            # Chinese text and differs between machines.
            with open(file_path, 'a', encoding='utf-8') as file:
                file.write(link_text + "," + get_lower_pinyin(link_text) + "\n")

            # A missing href has nothing to follow, and an empty one would
            # resolve back to this very page and recurse without end.
            href = link.get('href')
            if not href:
                continue

            time.sleep(0.5)  # pause between requests
            get_link_text(urljoin(url, href), timeout=timeout)
-
# Output file that get_link_text appends to; it is read there as a module-level global.
file_path = r"r:\2.csv"
# Root page from which the crawl starts.
url = "https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/"
get_link_text(url)
复制代码
|