Wikidata searches using SPARQL

Wikidata lookups via SPARQL

Detects and returns pairs from a list of Wikidata QIDs with links to each other. The base link is "wdt:P279*", which covers both direct and indirect subclasses (P279).

Parameters:
  • list_of_qids (list) –

    A list of Wikidata QIDs.

  • link_phrase (str, default: 'wdt:P279*' ) –

    The link to be searched between the entities. Defaults to "wdt:P279*"

Source code in wdcuration/sparql.py
def detect_direct_links(list_of_qids, link_phrase="wdt:P279*"):
    """Detects and returns pairs from a list of Wikidata QIDs
    with links to each other.
    The base link is "wdt:P279*" which covers indirect and direct subclasses (P279*).

    Args:
      list_of_qids (list): A list of Wikidata QIDs. Entries whose string
        form is "nan" are dropped before the query is built.
      link_phrase (str): The link to be searched between the entities. Defaults to "wdt:P279*"

    Returns:
      list: Whatever query_wikidata returns for the pair query, which
        projects the two linked QIDs as "a" and "b". An empty list is
        returned without querying when fewer than two usable QIDs remain.
    """
    clean_list = [x for x in list_of_qids if str(x) != "nan"]

    # A link needs two distinct entities (see the FILTER below), and an empty
    # list would produce the invalid pattern "VALUES ?a_ { wd:}".
    if len(clean_list) < 2:
        return []

    formatted_qids = "{ wd:" + " wd:".join(clean_list) + "}"
    query = f"""
  SELECT
    (REPLACE(STR(?a_), ".*Q", "Q") AS ?a)
    (REPLACE(STR(?b_), ".*Q", "Q") AS ?b)

  WHERE
{{

  VALUES ?a_ {formatted_qids} .
  VALUES ?b_ {formatted_qids} .
  FILTER (?a_ != ?b_)
  ?a_ {link_phrase}?b_ .
  }}"""
    return query_wikidata(query)

get_statement_values(qid, property, label=False)

Return the values for a Wikidata QID + PID pair as a Python list.

Source code in wdcuration/sparql.py
def get_statement_values(qid, property, label=False):
    """
    Fetch the values stored for one Wikidata QID + PID pair as a Python list.

    Args:
      qid (str): The item identifier, inserted after the "wd:" prefix.
      property (str): The property identifier, inserted after the "wdt:" prefix.
      label (bool): When True, also fetch the English rdfs:label of each value.

    Returns:
      list: With label=False, the "value" string of every result row.
        With label=True, one dict per row with the keys "id" (the text
        after the last "/" of the value) and "label".
    """
    label_projection, label_line = "", ""
    if label:
        label_projection = "?valueLabel"
        label_line = (
            "?value rdfs:label ?valueLabel . FILTER (LANG (?valueLabel) = 'en')"
        )

    query = f"""
    SELECT ?value {label_projection}
    WHERE
    {{
        wd:{qid} wdt:{property} ?value .
        {label_line}
    }}
    """

    rows = query_wikidata(query)
    if not label:
        return [row["value"] for row in rows]

    return [
        {"id": row["value"].split("/")[-1], "label": row["valueLabel"]}
        for row in rows
    ]

get_wikidata_items_for_id(identifier_property)

Returns an ID:QID dictionary for all occurrences of a certain identifier on Wikidata. Might time out for heavily used identifiers.

Parameters:
  • identifier_property (str) –

    The identifier property to be used on Wikidata. E.g. "P7963".

Source code in wdcuration/sparql.py
def get_wikidata_items_for_id(identifier_property):
    """
    Build an ID:QID dictionary covering every use of an identifier property on Wikidata.
    The query is unbounded, so it might time out for heavily used identifiers.

    Args:
      identifier_property (str): The identifier property to be used on Wikidata. E.g. "P7963".

    Returns:
      dict: Maps each identifier value (as a string) to the QID carrying it.
        When the same value appears in several rows, the last row wins.
    """
    query = f'  SELECT DISTINCT ?id   (REPLACE(STR(?item), ".*Q", "Q") AS ?qid)  WHERE {{ ?item wdt:{identifier_property} ?id . }} '
    rows = query_wikidata(query)
    return {str(row["id"]): row["qid"] for row in rows}

lookup_id(id, property, default='')

Looks up a foreign ID on Wikidata based on its specific property.

Parameters:
  • id (str) –

    The value of the ID as encoded on Wikidata.

  • property (str) –

    The property used to link to that ID.

  • default (str, default: '' ) –

    What to return if no unique ID is present. Defaults to "".

Returns:
  • str( str ) –

    The Wikidata QID for the foreign ID or "".

Source code in wdcuration/sparql.py
def lookup_id(id, property, default="") -> str:
    """
    Looks up a foreign ID on Wikidata based on its specific property.

    Args:
      id (str): The value of the ID as encoded on Wikidata.
      property (str): The property used to link to that ID.
      default (str): What to return if no unique ID is present. Defaults to "".

    Returns:
      str: The Wikidata QID for the foreign ID, or `default` when the query
        matches zero items or more than one item.
    """
    # Escape backslashes and double quotes so the ID always forms exactly one
    # valid SPARQL string literal, whatever characters it contains.
    literal = str(id).replace("\\", "\\\\").replace('"', '\\"')

    query = f"""
    SELECT ?item
    WHERE
    {{
        ?item wdt:{property} "{literal}" .
    }}
    """
    # query_wikidata uses the same default endpoint and user agent that this
    # function previously configured by hand.
    bindings = query_wikidata(query)
    if len(bindings) != 1:
        return default

    # The item comes back as an entity URI; keep only the trailing QID.
    return bindings[0]["item"].split("/")[-1]

lookup_label(qid, lang='en', default='')

Looks up a label on Wikidata given a QID.

Source code in wdcuration/sparql.py
def lookup_label(qid, lang="en", default=""):
    """
    Looks up a label on Wikidata given a QID.

    Args:
      qid (str): The item to look up, either bare ("Q42") or already written
        as a SPARQL term ("wd:Q42").
      lang (str): Language code of the wanted label. Defaults to "en".
      default (str): What to return if no unique label is found. Defaults to "".

    Returns:
      str: The label text, or `default` when the query returns zero rows or
        more than one row.
    """
    # A bare QID is not a valid SPARQL term; add the "wd:" prefix unless the
    # caller already supplied a prefixed name or full IRI.
    term = str(qid)
    subject = term if ":" in term else f"wd:{term}"

    query = f"""
    SELECT ?item ?itemLabel
    WHERE
    {{
        {subject} rdfs:label ?itemLabel.
        FILTER (LANG (?itemLabel) = "{lang}")
    }}
    """
    bindings = query_wikidata(query)
    if len(bindings) != 1:
        return default

    # Labels are plain text, not URIs: return them whole so that a label
    # containing "/" (e.g. "AC/DC") is not truncated.
    return bindings[0]["itemLabel"]

lookup_multiple_ids(list_of_ids, wikidata_property, return_type='dict')

Looks up multiple IDs on Wikidata and returns a dict containing them and the QIDs.

Source code in wdcuration/sparql.py
def lookup_multiple_ids(list_of_ids, wikidata_property, return_type="dict"):
    """
    Looks up multiple IDs on Wikidata and returns a dict containing them and the QIDs.

    Inputs longer than 200 entries are split with `chunk` and looked up piece
    by piece, pausing 0.3 s after each piece.

    Args:
      list_of_ids (list): The identifier values to search for.
      wikidata_property (str): The property (e.g. "P699") holding those values.
      return_type (str): "dict" for an ID -> QID mapping, "list" for just the
        QIDs. Defaults to "dict".

    Returns:
      dict | list | None: The mapping or the list of QIDs as selected by
        `return_type`; None for any other `return_type`. IDs without a match
        are absent, and an empty input yields an empty result without querying.
    """
    result_dict = {}

    if len(list_of_ids) > 200:
        for small_list in tqdm(chunk(list_of_ids, 200)):
            result_dict.update(lookup_multiple_ids(small_list, wikidata_property))
            sleep(0.3)
    elif len(list_of_ids) > 0:
        # Quote every ID separately and separate them with whitespace; gluing
        # them as "a""b" misparses once an ID is empty, because three
        # consecutive quotes open a long string literal.
        formatted_ids = " ".join(f'"{single_id}"' for single_id in list_of_ids)
        query = (
            """
  SELECT
  (REPLACE(STR(?item), ".*Q", "Q") AS ?qid)
  ?id
  WHERE { """
            f"VALUES ?id {{ {formatted_ids} }} . "
            f"?item wdt:{wikidata_property} ?id . "
            """
  }
  """
        )
        for entry in query_wikidata(query):
            result_dict[entry["id"]] = entry["qid"]

    # Single exit point, so the chunked path can no longer fall through into
    # one oversized query when return_type is unrecognised.
    if return_type == "dict":
        return result_dict
    if return_type == "list":
        return list(result_dict.values())
    return None

lookup_value_for_multiple_qids(list_of_qids, wikidata_property, return_type='dict')

Looks up multiple Wikidata QIDs on Wikidata and returns a dict containing them and the values for the property.

Source code in wdcuration/sparql.py
def lookup_value_for_multiple_qids(list_of_qids, wikidata_property, return_type="dict"):
    """
    Looks up multiple Wikidata QIDs on Wikidata and returns a dict containing them and the values for the property.

    Inputs longer than 200 entries are split with `chunk` and looked up piece
    by piece, pausing 0.3 s after each piece.

    Args:
      list_of_qids (list): The Wikidata QIDs to look up.
      wikidata_property (str): The property (e.g. "P699") whose values are wanted.
      return_type (str): "dict" for a QID -> value mapping, "list" for just
        the values. Defaults to "dict".

    Returns:
      dict | list | None: The mapping or the list of values as selected by
        `return_type`; None for any other `return_type`. When a QID has
        several values only the last one returned by the query is kept. An
        empty input yields an empty result without querying.
    """
    result_dict = {}

    if len(list_of_qids) > 200:
        for small_list in tqdm(chunk(list_of_qids, 200)):
            result_dict.update(
                lookup_value_for_multiple_qids(small_list, wikidata_property)
            )
            sleep(0.3)
    elif len(list_of_qids) > 0:
        # Skipped for empty input: there is nothing to look up, and a VALUES
        # clause built from no QIDs would not be a useful query.
        formatted_qids = format_with_prefix(list_of_qids)
        query = (
            """
  SELECT
  (REPLACE(STR(?item), ".*Q", "Q") AS ?qid)
  ?id
  WHERE { """
            f"VALUES ?item {formatted_qids}. "
            f"?item wdt:{wikidata_property} ?id . "
            """
  }
  """
        )
        for entry in query_wikidata(query):
            result_dict[entry["qid"]] = entry["id"]

    # Single exit point, so the chunked path can no longer fall through into
    # one oversized query when return_type is unrecognised.
    if return_type == "dict":
        return result_dict
    if return_type == "list":
        return list(result_dict.values())
    return None

query_wikidata(query, endpoint='https://query.wikidata.org/sparql', agent='wdcuration (https://github.com/lubianat/wdcuration)', simplify=True)

A simple function to query Wikidata and return the result bindings as a Python list of dictionaries (or the raw bindings when simplify=False).

Source code in wdcuration/sparql.py
def query_wikidata(
    query,
    endpoint="https://query.wikidata.org/sparql",
    agent="wdcuration (https://github.com/lubianat/wdcuration)",
    simplify=True,
):
    """Run a SPARQL query against an endpoint and return its result bindings.

    Args:
      query (str): The SPARQL query text.
      endpoint (str): URL of the SPARQL endpoint.
      agent (str): User-agent string passed to SPARQLWrapper.
      simplify (bool): When True, flatten each binding to its "value" field.

    Returns:
      list: With simplify=True, one dict per result row mapping each
        variable name to the "value" of its binding. With simplify=False,
        the raw "bindings" list from the JSON response.
    """
    client = SPARQLWrapper(endpoint=endpoint, agent=agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)

    raw_bindings = client.query().convert()["results"]["bindings"]
    if not simplify:
        return raw_bindings

    return [
        {variable: cell["value"] for variable, cell in row.items()}
        for row in raw_bindings
    ]