主题匹配存在一些问题,因此您需要扩展我使用的 match_topic() 方法;不过我添加了一些逻辑,用于查看最后有哪些内容没有匹配上。
results 变量包含一个由 dict 组成的列表,您可以轻松地将其保存为 JSON 文件。
请查看内联注释,以了解我所用逻辑背后的理由。
旁注:
如果我是你,我会稍微重构一下 JSON。对我来说,将 topic 作为键/值对分别放在 GPO 和 CAP 键下,比使用一个带有单独 GPO 和 CAP 子键的 Topic 键更有意义。
import csv
from pprint import pprint
import json
# Load gpo_full.csv into a list of dicts, one dict per CSV row keyed by the
# header names. newline="" is what the csv module asks for when opening
# files, so that quoted fields containing line breaks are parsed correctly.
with open("path/to/file/gpo_full.csv", newline="") as infile:
    gpo_full = list(csv.DictReader(infile))

# Do the same for CAP_cols.csv.
with open("path/to/file/CAP_cols.csv", newline="") as infile:
    cap_cols = list(csv.DictReader(infile))
# Custom (GPO topic, CAP topic) pairs that count as a match even though the
# GPO topic is not a substring of the CAP topic. Add one tuple per mapping.
_CUSTOM_TOPIC_MATCHES = frozenset({
    ("weather", "rain & cloudy"),
    ("foo", "bar"),  # example secondary topic matching
})


def match_topic(gpo_topic: str, cap_topic: str) -> bool:
    """Return True when a GPO topic and a CAP topic refer to the same thing.

    A function is needed because some of the mapping is not simple: most
    topics match by substring, the rest need an explicit custom pairing.

    Args:
        gpo_topic (str): gpo topic
        cap_topic (str): CAP topic

    Returns:
        bool: True if the GPO topic is contained in the CAP topic or the
        pair is listed in ``_CUSTOM_TOPIC_MATCHES``; False otherwise,
        including when either topic is empty or missing.
    """
    # An empty string is a substring of every string, so a blank GPO topic
    # would otherwise match any CAP topic; a missing (None) value would make
    # the ``in`` test raise TypeError. Treat both as "no match".
    if not gpo_topic or not cap_topic:
        return False

    # The simple case: the GPO topic appears inside the CAP topic.
    if gpo_topic in cap_topic:
        return True

    # Otherwise fall back to the explicit custom mappings.
    return (gpo_topic, cap_topic) in _CUSTOM_TOPIC_MATCHES
# gpo_full shrinks as matched rows are removed below, so record its
# starting size now for later reporting.
gpo_length = len(gpo_full)
results = []
cap_left_over = []

# Do the actual mapping: pair each CAP row with the first remaining GPO row
# that has the same specific date and a matching topic.
for cap in cap_cols:
    match_index = next(
        (
            position
            for position, candidate in enumerate(gpo_full)
            if candidate["Specific_Date"] == cap["Specific_Date"]  # same date
            and match_topic(candidate["topic"], cap["topic"])  # same topic
        ),
        None,
    )

    # No GPO row matched: keep the CAP row so leftovers can be inspected.
    if match_index is None:
        cap_left_over.append(cap)
        continue

    gpo = gpo_full[match_index]
    combined = {
        "Date": gpo["Date"],
        "Specific_Date": gpo["Specific_Date"],
        "Topic": {
            "GPO": gpo["topic"],
            "CAP": cap["topic"],
        },
        "GPO": {
            "hearing_sub_type": gpo["hearing_sub_type"],
        },
        "CAP": {
            "majortopic": cap["majortopic"],
            "id": cap["id"],
            "Chamber": cap["Chamber"],
        },
    }
    results.append(combined)

    # Remove the matched GPO row so that gpo_full ends up holding only the
    # rows that never matched; it also shortens every later search.
    del gpo_full[match_index]
# Save the combined records as JSON.
with open('path/to/file/combined_json.json', 'w') as outfile:
    json.dump(results, outfile, indent=4)

pprint(results)

# Summary counts. The CAP figure must be the number of CAP rows
# (len(cap_cols)); using the loop variable `cap` would report the number of
# keys in the last CAP row instead, and fail when cap_cols is empty.
print(f'\nLength:\n Results: {len(results)}\n CAP: {len(cap_cols)}\n GPO: {gpo_length}')

# Whatever is still in gpo_full was never matched to a CAP row.
print('\nLeftover GPO:')
pprint(gpo_full)

# CAP rows for which no GPO row was found.
print('\nLeftover CAP:')
pprint(cap_left_over)
输出
(我已从输出中删除了 pprint(results) 打印的键/值对……,请进一步查看下方的 JSON。)
Length:
Results: 5
CAP: 6
GPO: 7
Leftover GPO:
[{'Date': 'April,2001','Specific_Date': 'NaN ','hearing_sub_type': 'Oversight','topic': 'people'},{'Date': 'June,2000','Specific_Date': 'June 6,'topic': 'depressed'}]
Leftover CAP:
[{'Chamber': '2','Date': 'June,'id': '79847','majortopic': '4','topic': 'emotion'},{'Chamber': '1','Date': 'May,'Specific_Date': 'NaN','id': '79848','majortopic': '13','topic': 'NaN'}]
Date,hearing_sub_type,topic,Specific_Date
"January,1997",Oversight,weather,"January 12,1997"
"June,2000",General,life,"June 5,2000"
"January,forest,"January 1,1997"
"April,2001",people,NaN
"June,depressed,"June 6,2000"
path/to/file/gpo_full.csv
majortopic,id,Chamber,Date,Specific_Date
21,79846,1,many forest,"January,1997"
4,79847,2,emotion,"June,2000"
13,79848,NaN,"May,"NaN"
7,79849,good life,2000"
21,79850,good weather,1997"
25,79851,rain & cloudy,1997"
6,79852,sad & depressed,2000"
path/to/file/CAP_cols.csv
[
{
"Date": "January,"Specific_Date": "January 1,"Topic": {
"GPO": "forest","CAP": "many forest"
},"GPO": {
"hearing_sub_type": "General"
},"CAP": {
"majortopic": "21","id": "79846","Chamber": "1"
}
},{
"Date": "June,"Specific_Date": "June 5,"Topic": {
"GPO": "life","CAP": "good life"
},"CAP": {
"majortopic": "7","id": "79849","Chamber": "2"
}
},{
"Date": "January,"Topic": {
"GPO": "weather","CAP": "good weather"
},"id": "79850","Specific_Date": "January 12,"CAP": "rain & cloudy"
},"GPO": {
"hearing_sub_type": "Oversight"
},"CAP": {
"majortopic": "25","id": "79851","Topic": {
"GPO": "depressed","CAP": "sad & depressed"
},"CAP": {
"majortopic": "6","id": "79852","Chamber": "2"
}
}
]
path/to/file/combined_json.json
subset