【发布时间】:2018-09-18 15:10:43
【问题描述】:
我正在将数千个 XML 文件解析成字典,并将它们的结构存储在 JSON 中。
它们有很多相同的结构,但有未知数量的不同标签命名方案。在这数千个文件中,存在各种不同的命名标签缩写。
我需要找出存在多少个不同的标签来描述每条信息,以便正确解析它们。
为此,我想创建一个 XML/字典的主字典,其中包含标记名称的所有变体,最好是它们在数千个 XML/字典中的计数。
这是其中一本词典的一小部分示例:
{
"Header": {
"Ts": {},
"PeriodEndDt": {},
"PreparedBy": {
"PreparerID": {},
"PreparerFirmName": {
"BusinessNameLine1Txt": {}
},
"PreparerAddress": {
"AddLn1Txt": {},
"CityName": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
}
},
"FormTypeCd": {},
"PeriodBeginDt": {},
"Filer": {
"UniqueID": {},
"BusinessName": {
"BusinessNameLine1Txt": {}
},
"BusinessNameControlTxt": {},
"PhoneNum": {},
"USAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
}
},
"FormData": {
"FormCodeType": {
"BizType": {},
"AssetsAtEOY": {},
"AccountingMethod": {},
"RevenueAndExpenses": {
"ScheduleBNotReqd": {},
"DivsRevAndExpenses": {},
"DivsNetInvstIncomeAmt": {},
"NetGainSaleAstRevAndExpnssAmt": {},
"RevsOvrExpenses": {},
"NetInvestmentIncomeAmt": {}
},
"BalanceSheetGroup": {
"CashInvstBOYAmt": {},
"CashInvstEOYAmt": {},
"CashInvstEOYFMVAmt": {},
"OtherInvestmentsBOYAmt": {},
"OtherInvestmentsEOYAmt": {},
"CapitalStockEOYAmt": {},
"TotalLiabilitiesNetAstEOYAmt": {}
},
"ChangeNetAssetsFundGroup": {
"NetAssettFundBalancesBOYAmt": {},
"ExcessRevExpensesAmt": {},
"OtherIncreasesAmt": {},
"SubtotalAmt": {},
"OtherDecreasesAmt": {},
"TotNetAstOrFundBalancesEOYAmt": {}
},
"CapGainsLossTxInvstIncmDetail": {
"CapGainsLossTxInvstIncmGrp": {
"PropertyDesc": {},
"HowAcquiredCd": {},
"GrossSalesPriceAmt": {},
"GainOrLossAmt": {},
"GainsMinusExcessOrLossesAmt": {}
},
"StatementsRegardingActyGrp": {
"LegislativePoliticalActyInd": {},
"MoreThan100SpentInd": {}
},
"PhoneNum": {},
"LocationOfBooksUSAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
},
"CorporateDirectorsGrp": {
"DirectorsGrp": {
"PersonNm": {},
"USAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
},
"EmpPrograms": {
"EmployeeBenefitGroupNum": {},
"GroupType": {
"GroupElement": {},
"GroupCharacter": {
"GroupNames": {}
}
}
},
"EmpOffice1": {},
"EmpOffice2": {},
"EmpOffice3": {},
"EmpOffice4": {}
}
}
}
}
}
}
}
我首先用来创建字典/JSON 的代码是这样的:
import xml.etree.ElementTree as ET
strip_ns = lambda xx: str(xx).split('}', 1)[1]
tree = ET.parse('xmlpath.xml')
root = tree.getroot()
tierdict = {}
for tier1 in root:
tier1var = strip_ns(tier1.tag)
tierdict[tier1var] = {}
for tier2 in tier1:
tier2var = strip_ns(tier2.tag)
tierdict[tier1var][tier2var] = {}
for tier3 in tier2:
tier3var = strip_ns(tier3.tag)
tierdict[tier1var][tier2var][tier3var] = {}
for tier4 in tier3:
tier4var = strip_ns(tier4.tag)
tierdict[tier1var][tier2var][tier3var][tier4var] = {}
我想看到的输出是这样的:
{
"Header": {
"Header.Count": 5672,
"Ts": {
"Ts.Count": 3365
},
"Ss": {
"Ss.Count": 2328
},
【问题讨论】:
标签: python json xml dictionary elementtree