first version

This commit is contained in:
Bigsk 2023-02-25 01:27:27 +08:00
parent 2e1bdec5e0
commit 963ef1e82c
4 changed files with 343 additions and 0 deletions

59
data.py Normal file
View File

@ -0,0 +1,59 @@
import json
import threading
import sys
import requests
def main():
    """Download the monthly weather and AQI pages for Jiaxing into result.json.

    One daemon thread is started per URL (2014-2022, every month, for both the
    weather-history pages and the AQI pages).  While downloads are running the
    main thread prints the URLs that are still missing and then waits for a
    line on stdin: typing ``save`` writes whatever has been fetched so far and
    exits, any other input just refreshes the status.  When every URL has been
    fetched the complete mapping ``{url: html}`` is written to ``result.json``.
    """
    import time  # local import: only the retry pause below needs it

    # Same ordering as before: all years for January, then February, ...
    stamps = [
        f"{year}{month:02d}"
        for month in range(1, 13)
        for year in range(2014, 2023)
    ]
    weather_urls = [
        f"http://www.tianqihoubao.com/lishi/jiaxing/month/{stamp}.html"
        for stamp in stamps
    ]
    aqi_urls = [
        f"http://tianqihoubao.com/aqi/jiaxing-{stamp}.html"
        for stamp in stamps
    ]
    urls = [*weather_urls, *aqi_urls]
    data = {}

    def add(url):
        """Fetch *url*, retrying until it succeeds, and store the body in ``data``."""
        while True:
            try:
                d = requests.get(url, headers={
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
                }, timeout=10).text
            except requests.RequestException:
                # Only network/HTTP-layer failures are retried; anything else
                # is a programming error and should surface instead of
                # spinning forever.
                print(f"{url} Failed... Retrying...")
                # Brief pause so a persistent outage does not turn into a
                # tight request loop against the server.
                time.sleep(1)
            else:
                print(f"{url} Done")
                data[url] = d
                break

    def save():
        """Write the pages fetched so far to result.json."""
        # Snapshot first: worker threads may still be inserting into ``data``
        # and serialising a dict while it grows can fail mid-iteration.
        snapshot = dict(data)
        with open("result.json", "w") as fb:
            fb.write(json.dumps(snapshot))

    for url in urls:
        print(f"Getting {url}")
        thread = threading.Thread(target=add, args=(url,), name=url)
        # Daemon threads so that an early "save" exit is not blocked by
        # downloads that are still retrying.
        thread.daemon = True
        thread.start()

    while len(data) != len(urls):
        for url in urls:
            if url not in data:
                print(url)
        print(len(data), "Done")
        c = input()
        if c == "save":
            save()
            sys.exit(0)
    # NOTE: the response status is never checked, so an HTTP error page is
    # stored like any other body.
    save()


if __name__ == "__main__":
    main()

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
requests
bs4
pandas

1
result.json Normal file

File diff suppressed because one or more lines are too long

280
solve.py Normal file
View File

@ -0,0 +1,280 @@
import json
import os
from bs4 import BeautifulSoup
import pandas as pd
# NOTE(review): in the copy of this script that was reviewed, several
# one-character string literals were empty (""), which makes str.split("")
# raise ValueError and '"" in text' always true.  The values below are
# reconstructed from how each literal is used (and from the neighbouring
# "风向" / "无持续" literals, which were intact) -- confirm them against the
# original file before relying on the output.
_YEAR_MARK = "年"            # presumably separates year from month in a date
_MONTH_MARK = "月"           # presumably separates month from day in a date
_DAY_MARK = "日"             # presumably the trailing day marker in a date
_DEGREE_MARK = "℃"          # presumably the unit suffix on temperatures
_WIND_MARK = "风"            # presumably separates wind direction from force
_LEVEL_MARK = "级"           # presumably the unit suffix on wind force
_RANGE_MARKS = ("~", "～")   # presumably the range sign normalised to "-"

_AQI_URL_PREFIX = "http://tianqihoubao.com/aqi/jiaxing-"
_WEATHER_URL_PREFIX = "http://www.tianqihoubao.com/lishi/jiaxing/month/"

_DATE_COLUMN = "日期"
_SKY_COLUMN = "天气状况"
_TEMPERATURE_COLUMN = "最低气温/最高气温"
_WIND_COLUMN = "风力风向(夜间/白天)"
# Source columns that hold "a/b" pairs and are replaced by derived columns.
_SPLIT_COLUMNS = (_SKY_COLUMN, _TEMPERATURE_COLUMN, _WIND_COLUMN)
# Derived columns, in the order the original script appended them.
_DERIVED_COLUMNS = [
    "天气1", "天气2",
    "最高气温", "最低气温",
    "白天风向", "夜晚风向",
    "白天风力1", "白天风力2",
    "夜晚风力1", "夜晚风力2",
]


def _content_block(html, selector):
    """Return the first element matching *selector* inside ``div#content``."""
    soup = BeautifulSoup(html, "html.parser")
    return soup.find("div", id="content").select(selector)[0]


def _header_names(header_row):
    """Return the visible text of every ``<td>`` in the header row."""
    # get_text() replaces the former str.strip("<td>...") calls, which strip
    # a *set of characters* rather than a tag and could eat real content.
    return [cell.get_text(strip=True) for cell in header_row.select("td")]


def _number_or_text(text):
    """Return *text* as a float when it parses as one, otherwise unchanged."""
    try:
        return float(text)
    except ValueError:
        return text


def _month_sheet_path(url, prefix):
    """Create ``output/<year>/`` for *url* and return the month's sheet path."""
    # The URL ends in YYYYMM.html; integer division/modulo split it.
    stamp = int(url.replace(prefix, "").replace(".html", ""))
    folder = os.path.join("output", f"{stamp // 100}")
    os.makedirs(folder, exist_ok=True)
    # NOTE(review): writing ".xls" needs the xlwt engine, which recent pandas
    # releases no longer support -- confirm the pandas version in use.
    return os.path.join(folder, "{}月.xls".format(stamp % 100))


def _aqi_frame(html):
    """Parse one AQI page into a DataFrame with one row per table row."""
    rows = _content_block(html, ".api_month_list").table.select("tr")
    header = _header_names(rows[0])
    records = []
    for row in rows[1:]:
        values = [
            _number_or_text(cell.get_text(strip=True))
            for cell in row.select("td")
        ]
        if values:
            records.append(dict(zip(header, values)))
    # Building the frame once avoids the quadratic, since-removed
    # DataFrame.append-in-a-loop pattern.
    return pd.DataFrame(records, columns=header)


def _split_pair(text):
    """Split an ``a / b`` cell into its parts with whitespace removed."""
    compact = "".join(text.split()).replace("&lt;", "<")
    for mark in _RANGE_MARKS:
        compact = compact.replace(mark, "-")
    return compact.split("/")


def _degrees(text):
    """Return the integer temperature in *text*, ignoring the unit suffix."""
    return int(str(text).replace(_DEGREE_MARK, ""))


def _parse_wind(entry):
    """Split one wind entry into ``(direction, force1, force2)``.

    Entries containing "无持续" are split on "风向", all others on the wind
    mark.  A force written as a range ("3-4") yields two integers; any other
    force is returned unchanged as text in both slots, as the original
    branches did.
    """
    separator = "风向" if "无持续" in entry else _WIND_MARK
    parts = entry.split(separator)
    direction = parts[0]
    force = parts[1].replace(_LEVEL_MARK, "")
    if "-" in entry:
        bounds = force.split("-")
        return direction, int(bounds[0]), int(bounds[1])
    return direction, force, force


def _weather_record(raw):
    """Turn one parsed table row into a flat record with the derived columns."""
    record = {
        name: value for name, value in raw.items()
        if name not in _SPLIT_COLUMNS
    }
    sky = raw[_SKY_COLUMN]
    record["天气1"], record["天气2"] = sky[0], sky[1]
    temperatures = raw[_TEMPERATURE_COLUMN]
    record["最低气温"] = _degrees(temperatures[0])
    record["最高气温"] = _degrees(temperatures[1])
    wind = raw[_WIND_COLUMN]
    # NOTE(review): the column header reads night/day, yet the first item has
    # always been stored as the daytime value; kept as-is -- confirm the order
    # against the site.
    record["白天风向"], record["白天风力1"], record["白天风力2"] = _parse_wind(wind[0])
    record["夜晚风向"], record["夜晚风力1"], record["夜晚风力2"] = _parse_wind(wind[1])
    return record


def _weather_records(html):
    """Parse one weather page; return ``(column_order, records)``."""
    rows = _content_block(html, ".b").select("tr")
    header = _header_names(rows[0])
    columns = [n for n in header if n not in _SPLIT_COLUMNS] + _DERIVED_COLUMNS
    records = []
    for row in rows[1:]:
        cells = row.select("td")
        if not cells:
            continue
        raw = {}
        for name, cell in zip(header, cells):
            text = cell.get_text(strip=True)
            # The date cell is kept whole; every other cell is an "a / b" pair.
            raw[name] = text if name == _DATE_COLUMN else _split_pair(text)
        records.append(_weather_record(raw))
    return columns, records


def _iso_date(text):
    """Convert a year/month/day-marked date into ``YYYY-MM-DD``."""
    year, rest = text.split(_YEAR_MARK)[0], text.split(_YEAR_MARK)[1]
    month = rest.split(_MONTH_MARK)[0]
    day = rest.split(_MONTH_MARK)[1].replace(_DAY_MARK, "")
    return "{}-{}-{}".format(year, month.zfill(2), day.zfill(2))


def _merge_with_existing(columns, records, existing):
    """Attach the weather columns to the rows of an already written sheet.

    Rows come from *existing* (its dates are compared as ``YYYY-MM-DD``
    strings); weather columns are placed first, followed by the remaining
    columns of *existing*.  Rows without a matching weather date keep their
    previous value for that column, or NaN when the column is new.
    """
    by_date = {_iso_date(str(rec[_DATE_COLUMN])): rec for rec in records}
    keys = [str(value) for value in existing[_DATE_COLUMN]]
    merged = pd.DataFrame(index=existing.index)
    for name in columns:
        if name == _DATE_COLUMN:
            merged[name] = existing[name]
            continue
        if name in existing.columns:
            previous = existing[name].tolist()
        else:
            previous = [float("nan")] * len(keys)
        values = [
            by_date[key][name] if key in by_date else old
            for key, old in zip(keys, previous)
        ]
        # object dtype: a column can mix integers, text and NaN.
        merged[name] = pd.Series(values, index=existing.index, dtype=object)
    for name in existing.columns:
        # "Unnamed: 0" is the index column read_excel brings back from the
        # previously written sheet; it is not carried over.
        if name not in merged.columns and name != "Unnamed: 0":
            merged[name] = existing[name]
    return merged


def main():
    """Convert the pages stored in result.json into per-month Excel sheets.

    Reads ``result.json`` (a ``{url: html}`` mapping), writes every AQI page
    to ``output/<year>/<month>月.xls`` and then processes every weather page:
    when the month's sheet already exists the weather columns are merged into
    it by date, otherwise the weather table is written on its own.
    """
    os.makedirs("output", exist_ok=True)
    with open("result.json", "r") as fb:
        pages = json.loads(fb.read())

    aqi_pages = {url: html for url, html in pages.items() if "aqi" in url}
    weather_pages = {url: html for url, html in pages.items() if "aqi" not in url}

    # AQI sheets are written first so the weather pass can merge into them.
    for url, html in aqi_pages.items():
        frame = _aqi_frame(html)
        path = _month_sheet_path(url, _AQI_URL_PREFIX)
        frame.to_excel(path)
        print(f"{path} Done")

    for url, html in weather_pages.items():
        columns, records = _weather_records(html)
        path = _month_sheet_path(url, _WEATHER_URL_PREFIX)
        if os.path.exists(path):
            frame = _merge_with_existing(columns, records, pd.read_excel(path))
        else:
            frame = pd.DataFrame(records, columns=columns)
        frame.to_excel(path)
        print(f"{path} Done")


if __name__ == "__main__":
    main()