import json
import time
import pandas as pd
import requests
import logging
import os
import shutil
# Configure the root logger: INFO level, written to data_processing.log
# (relative to the working directory), overwriting the previous run's log.
logging.basicConfig(
    filename='data_processing.log',
    filemode='w',
    encoding='UTF-8',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
# Constants use UPPER_SNAKE_CASE, per convention, so they are easy to find and change.
EXCEL_FILE_PATH = 'D:\\Windows\\Desktop\\taxCompare.xlsx'
SHEET_NAME = 'Sheet1'
# Must point at the real API host (the same host sent in the "Host" request
# header); a proxy/scraper prefix in front of it sends requests to the wrong server.
API_URL = 'https://capi.tianyancha.com/cloud-tempest/search/suggest/company/main'
BACKUP_FILE_SUFFIX = '.bak'
REQUEST_TIMEOUT = 5  # Request timeout in seconds, kept in one place for easy tuning
API_KEYWORD_PARAM = 'keyword'  # Name of the request field that carries the company name
# Load the worksheet to be checked. Every read failure is written to the log
# before being re-raised, because the log file is this script's only record
# of why a run stopped.
try:
    df = pd.read_excel(EXCEL_FILE_PATH, sheet_name=SHEET_NAME)
except FileNotFoundError as e:
    logging.error(f"读取Excel文件失败,文件 {EXCEL_FILE_PATH} 不存在,错误信息: {str(e)}")
    raise
except Exception as e:
    # Anything else (e.g. a missing worksheet or an unreadable/locked file)
    # would otherwise terminate the run without leaving a log entry.
    logging.error(
        f"读取Excel文件失败,文件路径: {EXCEL_FILE_PATH},工作表: {SHEET_NAME},错误信息: {str(e)}")
    raise
# Millisecond Unix timestamp, taken once at start-up and sent as a query
# parameter with every API request.
current_time = round(time.time() * 1000)
# Request headers sent with every call to the Tianyancha API:
# - x-Auth-Token: authentication token. It is sensitive, so it is read from
#   the TYC_AUTH_TOKEN environment variable when that is set; otherwise the
#   original placeholder value is used.
# - Content-Type: declares the request body as JSON.
# - x-Tycid, Host, User-Agent, version, Origin: identify the request source,
#   target host and client, as the API expects.
authToken = os.environ.get("TYC_AUTH_TOKEN", "天眼查token")
headers = {
    "x-Auth-Token": authToken,
    "Content-Type": "application/json",
    "x-Tycid": "0c99c4b0be8011ef96107538ca8b55ff",
    "Host": "capi.tianyancha.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
    "version": "TYC-Web",
    # Plain site origin, without any proxy/scraper prefix in front of it.
    "Origin": "https://www.tianyancha.com",
}
# Copy the workbook to "<path>.bak" before it gets overwritten. This is
# best-effort: a failed backup is logged and the run continues.
try:
    shutil.copy(EXCEL_FILE_PATH, EXCEL_FILE_PATH + BACKUP_FILE_SUFFIX)
except Exception as backup_error:
    logging.error(f"备份Excel文件时出现异常,详细错误信息: {str(backup_error)},文件路径: {EXCEL_FILE_PATH}")
else:
    logging.info(f"已成功备份Excel文件至 {EXCEL_FILE_PATH + BACKUP_FILE_SUFFIX}")
# The per-row logic lives in its own function so the driver loop stays short.
def process_row(index, row):
    """Check one spreadsheet row against the Tianyancha company-suggestion API.

    POSTs the row's company name to ``API_URL`` and, for every returned
    suggestion whose ``comName`` equals that name exactly, writes the
    suggestion's ``taxCode`` and a Y/N error flag into the module-level ``df``
    at ``index``. Request and parsing failures are logged and swallowed so a
    single bad row does not stop the run.

    Args:
        index: Label of the row in ``df``; used with ``df.at`` and, plus one,
            in the progress line printed for each match.
        row: The row's data; must provide the '企业名称' and
            '统一社会信用代码' columns.

    Returns:
        None. Results are written into ``df`` in place.
    """
    enterprise_name = row['企业名称']
    params = {API_KEYWORD_PARAM: enterprise_name}
    try:
        # A timeout surfaces as a requests exception and is logged below.
        response = requests.post(API_URL + f'?_={current_time}', headers=headers, json=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data_list = response.json()
        # A null suggestion list is treated the same as an empty one.
        company_suggest_list = data_list['data']['companySuggestList'] or []
        for data in company_suggest_list:
            if enterprise_name == data['comName']:
                df.at[index, '国家企业信用信息公示系统统一社会信用代码'] = data['taxCode']
                is_tax_code_matched = data['taxCode'] == row['统一社会信用代码']
                # The column means "is erroneous": a matching code is NOT an
                # error, so 'Y' is written only when the codes differ.
                df.at[index, '是否错误'] = 'N' if is_tax_code_matched else 'Y'
                print(f"行数: {index + 1},企业名称: {enterprise_name},税号: {data['taxCode']}")
    except json.JSONDecodeError as e:
        # Must precede RequestException: the decode error raised by current
        # requests versions is also a RequestException, which would otherwise
        # make this handler unreachable.
        logging.error(f"解析JSON数据出现异常,企业名称: {enterprise_name},异常信息: {str(e)}")
    except requests.RequestException as e:
        # Compare against None explicitly: a Response for a 4xx/5xx status is
        # falsy, so an `and` chain would yield the Response object itself
        # rather than its status code.
        failed_response = getattr(e, 'response', None)
        status_code = failed_response.status_code if failed_response is not None else None
        logging.error(
            f"请求出现异常,企业名称: {enterprise_name},异常信息: {str(e)},状态码: {status_code}")
    except (KeyError, TypeError) as e:
        # TypeError covers a null/non-dict section (e.g. "data": null), which
        # would otherwise abort the whole run.
        logging.error(
            f"解析JSON数据时找不到对应键,可能接口返回数据结构变化,企业名称: {enterprise_name},异常信息: {str(e)}")
# Run the per-row check over every row of the sheet.
for row_label, record in df.iterrows():
    process_row(row_label, record)
# Write the updated frame back over the original workbook. A failure is
# logged with the target path rather than raised.
try:
    with pd.ExcelWriter(EXCEL_FILE_PATH, engine='openpyxl', mode='w') as excel_writer:
        df.to_excel(excel_writer, sheet_name=SHEET_NAME, index=False)
except Exception as write_error:
    logging.error(f"将数据写回Excel文件时出现异常,详细错误信息: {str(write_error)},文件路径: {EXCEL_FILE_PATH}")
else:
    logging.info("已成功将更新后的数据写回Excel文件")
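# [For learning purposes only] Read data from the Tianyancha API, compare it with the data in the Excel file, and output the results.
# First published 2024-12-23 09:09:25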