python常用脚本
excel转metadata
pip install openpyxl
python excel2metadata.py report.xlsx report.json
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Excel 元数据提取工具(终极兼容版)
支持:公式、数组公式、下拉列表、批注、超链接
修复:id 参数、MultiCellRange、ArrayFormula 等 openpyxl 兼容问题
"""
# ===== 补丁 1:修复 DataValidation 的 id/xr:uid 问题 =====
try:
from openpyxl.worksheet.datavalidation import DataValidation
_orig_init = DataValidation.__init__
def _patched_init(self, *args, **kwargs):
kwargs.pop('id', None)
kwargs.pop('xr:uid', None)
return _orig_init(self, *args, **kwargs)
DataValidation.__init__ = _patched_init
except Exception:
pass
# =========================================================
import sys
import json
import argparse
from openpyxl import load_workbook
def extract_excel_metadata(file_path):
try:
wb = load_workbook(filename=file_path, data_only=False)
except Exception as e:
raise RuntimeError(f"无法打开 Excel 文件 '{file_path}':{e}")
output = {"sheets": {}}
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
sheet_data = {
"cells": {},
"data_validations": {}
}
for row in ws.iter_rows():
for cell in row:
if (cell.value is None
and cell.data_type != 'f'
and cell.comment is None
and cell.hyperlink is None):
continue
coord = cell.coordinate
info = {}
# === 修复点:安全处理 ArrayFormula ===
if cell.data_type == 'f':
# 普通公式是 str,数组公式是 ArrayFormula 对象
if hasattr(cell.value, 'text'):
info["formula"] = cell.value.text
else:
info["formula"] = str(cell.value)
else:
info["value"] = str(cell.value) if cell.value is not None else ""
# =====================================
if cell.comment:
info["comment"] = cell.comment.text
if cell.hyperlink:
info["hyperlink"] = cell.hyperlink.target
if info:
sheet_data["cells"][coord] = info
# 提取数据验证
for dv in ws.data_validations.dataValidation:
sqref_str = str(dv.sqref) if dv.sqref else ""
ranges = sqref_str.split()
for rng in ranges:
dv_info = {
"type": dv.type,
"operator": getattr(dv, 'operator', None),
"allow_blank": bool(getattr(dv, 'allowBlank', False)),
"show_input_message": bool(getattr(dv, 'showInputMessage', False)),
"show_error_message": bool(getattr(dv, 'showErrorMessage', False)),
}
if dv.formula1 is not None:
dv_info["formula1"] = dv.formula1
if dv.type == "list":
f1 = dv.formula1
if f1 and not f1.startswith("="):
clean_f1 = f1.strip('"')
if ',' in clean_f1:
dv_info["options"] = [opt.strip() for opt in clean_f1.split(',')]
else:
dv_info["options"] = [clean_f1]
else:
dv_info["source_range"] = f1
if dv.formula2 is not None:
dv_info["formula2"] = dv.formula2
sheet_data["data_validations"][rng] = dv_info
output["sheets"][sheet_name] = sheet_data
return output
def main():
parser = argparse.ArgumentParser(description="Excel 元数据导出工具(含公式、下拉、批注)")
parser.add_argument("input", help="输入的 .xlsx 文件路径")
parser.add_argument("output", nargs="?", default=None, help="输出 JSON 路径(可选)")
args = parser.parse_args()
try:
data = extract_excel_metadata(args.input)
json_str = json.dumps(data, indent=4, ensure_ascii=False)
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(json_str)
print(f"✅ 已保存到: {args.output}")
else:
print(json_str)
except Exception as e:
print(f"❌ 错误: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()