Sheet-based curation utilities
async_search_wikidata(search_term, session, excluded_types=None, fixed_type=None, exclude_basic=False)
async
Looks up string on Wikidata.
Returns a nested dictionary with the search term as the key and the associated results as the value.
Source code in wdcuration/sheet_based_curation.py
async def async_search_wikidata(
    search_term: str,
    session: ClientSession,
    excluded_types: List[str] = None,
    fixed_type: str = None,
    exclude_basic: bool = False,
):
    """
    Looks up a string on Wikidata.

    Builds a search expression from ``search_term`` plus optional
    ``haswbstatement:P31=...`` filters, sends it to the Wikidata
    ``action=query&list=search`` API endpoint and hands the decoded JSON
    to ``async_parse_result``.

    Args:
        search_term: The string to search for.
        session: The HTTP client session used to perform the request.
        excluded_types (list of str, optional): Values that are each added to
            the query as a ``-haswbstatement:P31=<value>`` clause.
        fixed_type (str, optional): If given, a ``haswbstatement:P31=<value>``
            clause is added to the query.
        exclude_basic (bool, optional): If True, the entries of
            ``BASIC_EXCLUSION`` are excluded as well.

    Returns:
        dict: A nested dictionary with the search term as key and the parsed
        results (as returned by ``async_parse_result``) as value.

    Raises:
        TypeError: If ``excluded_types`` is neither None nor a list.
        Exception: Whatever ``response.raise_for_status()`` raises when the
            response status code is >= 400.
    """
    if excluded_types is None:
        excluded_types = []
    elif not isinstance(excluded_types, list):
        raise TypeError("excluded_types must be a list")

    # Work on a copy so the caller's list is never mutated (this is what
    # previously made exclusions accumulate across calls).
    types_to_exclude = list(excluded_types)
    if exclude_basic:
        types_to_exclude.extend(BASIC_EXCLUSION)
    # De-duplicate *after* merging, keeping first-seen order so the generated
    # query is deterministic.
    types_to_exclude = list(dict.fromkeys(types_to_exclude))

    # Note: for some reason, adding the "haswbstatement" bits messes up with
    # the ranking of the results.
    search_expression = search_term + "".join(
        f" -haswbstatement:P31={excluded_type} " for excluded_type in types_to_exclude
    )
    if fixed_type is not None:
        search_expression += f" haswbstatement:P31={fixed_type} "

    api_url = "https://www.wikidata.org/w/api.php"
    payload = {
        "action": "query",
        "list": "search",
        "srsearch": search_expression,
        "language": "en",
        "format": "json",
        "origin": "*",
    }

    # Let the HTTP client encode the query string: concatenating raw values
    # into the URL breaks for search terms containing "&", "#", "+" or "=".
    async with session.request("GET", api_url, params=payload) as response:
        # Raise if the response code is >= 400.
        response.raise_for_status()
        # Decode the body without blocking the event loop.
        j = await response.json()

    parsed_result = await async_parse_result(j, session)
    return {search_term: parsed_result}
generate_curation_spreadsheet(identifiers_property, curation_table_path, output_file_path, description_term_lookup='', fixed_type=None, excluded_types=None, drop_nones=True, exclude_basic=False, overwrite=True)
Generates a curation spreadsheet based on input data, filtering and searching for Wikidata entries.
This function operates on a Mix'n'match-like spreadsheet, which should contain at least "name" and "id" columns. It enriches this spreadsheet with additional Wikidata information based on the given parameters.
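| Parameter | Type | Description | Default |
|---|---|---|---|
| `identifiers_property` | | The identifier property used for Wikidata searching. | required |
| `curation_table_path` | `str` | Path to the input spreadsheet to be curated. | required |
| `output_file_path` | `str` | Path where the curated spreadsheet will be saved. | required |
| `description_term_lookup` | `str` | A term to filter the input table based on the "description" column. | `''` |
| `fixed_type` | `str` | A fixed type to filter the Wikidata search results. | `None` |
| `excluded_types` | `list of str` | Types to exclude from the Wikidata search results. | `None` |
| `drop_nones` | `bool` | If True, rows without a Wikidata ID will be dropped. | `True` |
| `exclude_basic` | `bool` | If True, basic types will be excluded from the Wikidata search. | `False` |
| `overwrite` | `bool` | If False, code will check for the existence of a previous target file and keep it. | `True` |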
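| Returns | Description |
|---|---|
| `None` | The function outputs the curated spreadsheet to the specified file path. |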
Source code in wdcuration/sheet_based_curation.py
def generate_curation_spreadsheet(
    identifiers_property,
    curation_table_path: str,
    output_file_path: str,
    description_term_lookup: str = "",
    fixed_type: str = None,
    excluded_types: List[str] = None,
    drop_nones: bool = True,
    exclude_basic: bool = False,
    overwrite: bool = True,
):
    """
    Generates a curation spreadsheet based on input data, filtering and searching for Wikidata entries.

    This function operates on a Mix'n'match-like spreadsheet, which should contain at least "name" and "id" columns.
    It enriches this spreadsheet with additional Wikidata information based on the given parameters.

    Each name is singularized (when possible) to build its search term; every
    distinct search term is looked up once, in batches of 25.

    Args:
        identifiers_property: The identifier property used for Wikidata searching.
        curation_table_path: Path to the input spreadsheet to be curated.
        output_file_path: Path where the curated spreadsheet will be saved.
        description_term_lookup (str, optional): A term to filter the input table based on the "description" column.
        fixed_type (str, optional): A fixed type to filter the Wikidata search results.
        excluded_types (list of str, optional): Types to exclude from the Wikidata search results.
        drop_nones (bool, optional): If True, rows whose "wikidata_id" is "NONE" will be dropped.
        exclude_basic (bool, optional): If True, basic types will be excluded from the Wikidata search.
        overwrite (bool, optional): If False, code will check for the existence of a previous target file and keep it.

    Returns:
        None: The function outputs the curated spreadsheet to the specified file path.

    Raises:
        TypeError: If ``excluded_types`` is neither None nor a list.
    """
    if not overwrite and os.path.isfile(output_file_path):
        print(f"Target file '{output_file_path}' already exists. Skipping generation.")
        return

    if excluded_types is None:
        excluded_types = []
    elif not isinstance(excluded_types, list):
        raise TypeError("excluded_types must be a list")

    not_on_wikidata = get_subset_not_on_wikidata(
        identifiers_property, curation_table_path, description_term_lookup
    )

    p = inflect.engine()
    # singular_noun() returns a falsy value when it cannot singularize the
    # word; in that case the original name is used as the search term.
    search_terms_dict = {
        name: (p.singular_noun(name) or name) for name in not_on_wikidata["name"]
    }
    # Results are keyed by search term, so querying a term twice is wasted
    # traffic: keep each distinct term once, in first-seen order.
    search_terms = list(dict.fromkeys(search_terms_dict.values()))

    n_per_batch = 25
    list_of_search_lists = list(
        divide_in_chunks_of_equal_len(search_terms, n_per_batch)
    )

    results = {}
    print(f"Running {len(search_terms)} searches in batches of {n_per_batch}")
    last_batch_index = len(list_of_search_lists) - 1
    for batch_index, group_of_search_terms in enumerate(
        tqdm(list_of_search_lists, total=len(list_of_search_lists))
    ):
        # asyncio.run() replaces get_event_loop()/ensure_future(), which are
        # deprecated (and fail on recent Python versions) when no event loop
        # is running.
        results_now = asyncio.run(
            run_multiple_searches(
                group_of_search_terms,
                fixed_type=fixed_type,
                excluded_types=excluded_types,
                exclude_basic=exclude_basic,
            )
        )
        results.update(results_now)
        # Pause between batches (presumably to be gentle on the API); there
        # is nothing to wait for after the final one.
        if batch_index < last_batch_index:
            time.sleep(1)

    not_on_wikidata["search_term"] = not_on_wikidata["name"].map(search_terms_dict)
    for field in ("id", "label", "description"):
        not_on_wikidata[f"wikidata_{field}"] = not_on_wikidata["search_term"].map(
            {term: match[field] for term, match in results.items()}
        )

    if drop_nones:
        not_on_wikidata = not_on_wikidata[not_on_wikidata["wikidata_id"] != "NONE"]
    not_on_wikidata = not_on_wikidata.drop_duplicates()
    not_on_wikidata.to_csv(output_file_path, index=False)
get_quickstatements_for_curated_sheet(curated_sheet_path, wikidata_property, dropnas=False, add_name_as_alias=True, alias_lang='en')
Generates QuickStatements commands from a standardized curation sheet.
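| Parameter | Type | Description | Default |
|---|---|---|---|
| `curated_sheet_path` | `str` | The path to the sheet of interest. | required |
| `wikidata_property` | `str` | The PID of the property to use on QuickStatements. | required |
| `dropnas` | `bool` | Whether or not a curation column labeled "ok_row" was added. If true, rows with a missing value in that column are dropped. Useful when good matches are rare. | `False` |
| `add_name_as_alias` | `bool` | If true, an alias statement with the row's name is also emitted for each match. | `True` |
| `alias_lang` | `str` | Language code used in the alias statement. | `'en'` |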
Source code in wdcuration/sheet_based_curation.py
def get_quickstatements_for_curated_sheet(
    curated_sheet_path, wikidata_property, dropnas=False, add_name_as_alias=True, alias_lang="en"
):
    """
    Gets QuickStatements commands from a standardized curation sheet.

    The sheet is read as CSV and must contain the columns "id", "name" and
    "wikidata_id" (plus "ok_row" when ``dropnas`` is used). Rows whose
    "wikidata_id" is "NONE" or empty are skipped.

    Args:
        curated_sheet_path (str): The path to the sheet of interest.
        wikidata_property (str): The PID of the property to use on Quickstatements.
        dropnas (bool): Whether or not a curation column labeled "ok_row" was added.
            If true, will drop rows with missing values in that column.
            Useful when good matches are rare.
        add_name_as_alias (bool): If true, also emits an alias statement
            carrying the row's "name" for every matched row.
        alias_lang (str): Language code used in the alias statement.

    Returns:
        str: One statement per line, each terminated by a newline
        (an empty string when no row qualifies).
    """
    # Read "id" as text so identifiers keep e.g. their leading zeros.
    df = pd.read_csv(curated_sheet_path, dtype={"id": object})
    if dropnas:
        df = df.dropna(subset=["ok_row"])

    statements = []
    for _, row in df.iterrows():
        wikidata_id = row["wikidata_id"]
        # An empty cell is read as NaN; emitting it would produce an invalid
        # "nan|P...|..." statement, so treat it like the "NONE" marker.
        if pd.isna(wikidata_id) or wikidata_id == "NONE":
            continue
        database_id = row["id"]
        database_label = row["name"]
        statements.append(f'{wikidata_id}|{wikidata_property}|"{database_id}"' + "\n")
        if add_name_as_alias:
            statements.append(f'{wikidata_id}|A{alias_lang}|"{database_label}"' + "\n")
    return "".join(statements)