Source code for ips_python.refinement

import time
from os.path import join

import pandas as pd

try:
    from ips_python.utils import get_data_path
    from ips_python.constants import (
        INPUT_DATA_FILENAME,
        COSINE_FILENAME,
        IATI_IDENTIFIER_COLUMN_NAME,
        TITLE_COLUMN_NAME,
        DESCRIPTION_COLUMN_NAME,
        ORG_ID_COLUMN_NAME,
        IATI_FIELDS,
    )
    from ips_python.preprocessing import preprocessing_initial_text_clean
except ModuleNotFoundError:
    from utils import get_data_path
    from constants import (
        INPUT_DATA_FILENAME,
        COSINE_FILENAME,
        IATI_IDENTIFIER_COLUMN_NAME,
        TITLE_COLUMN_NAME,
        DESCRIPTION_COLUMN_NAME,
        ORG_ID_COLUMN_NAME,
        IATI_FIELDS,
    )
    from preprocessing import preprocessing_initial_text_clean


def process_results(initial_result_df, full_iati_records, number_of_results=100):
    """Join cosine-similarity scores onto the full IATI records and keep the top matches.

    Args:
        initial_result_df: DataFrame of cosine-similarity results, keyed on
            the IATI identifier column and containing a "cosine_sim" column.
        full_iati_records: DataFrame of full IATI records.
        number_of_results: Maximum number of rows to return (default 100).

    Returns:
        DataFrame of the top ``number_of_results`` records, sorted by
        descending cosine similarity.
    """
    start_time = time.time()

    keep_columns = IATI_FIELDS
    full_iati_df = full_iati_records[keep_columns]
    print("select columns after {} seconds".format(time.time() - start_time))

    # Select unique records on all fields but the IATI identifier, keeping the
    # identifier of the first record in each duplicate set.
    make_unique = [f for f in keep_columns if f != IATI_IDENTIFIER_COLUMN_NAME]
    full_iati_df = full_iati_df.drop_duplicates(subset=make_unique, keep="first")
    print("duplicates dropped after {} seconds".format(time.time() - start_time))

    full_iati_df = full_iati_df.merge(
        initial_result_df, on=IATI_IDENTIFIER_COLUMN_NAME, how="inner"
    )
    print("joined cosine res after {} seconds".format(time.time() - start_time))

    full_iati_df.sort_values(by="cosine_sim", ascending=False, inplace=True)
    print("sorted by res after {} seconds".format(time.time() - start_time))

    full_iati_df = full_iati_df.head(number_of_results)
    print("limited after {} seconds".format(time.time() - start_time))

    # Further filtering ideas: remove results with a null description,
    # e.g. records matched on title alone.
    return full_iati_df
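
# Illustrative sketch (not part of the original module): the core
# merge -> sort -> head pattern used by process_results, on toy data.
# The real schema comes from IATI_FIELDS; the column names below, apart
# from "cosine_sim", are assumptions made for this demo only.
def _process_results_demo():
    records = pd.DataFrame(
        {
            "iati.identifier": ["a", "b", "c"],
            "title": ["Water project", "Health project", "Rural roads"],
        }
    )
    scores = pd.DataFrame(
        {"iati.identifier": ["a", "b", "c"], "cosine_sim": [0.2, 0.9, 0.5]}
    )
    # Inner-join scores onto records, rank by similarity, keep the top 2.
    return (
        records.merge(scores, on="iati.identifier", how="inner")
        .sort_values(by="cosine_sim", ascending=False)
        .head(2)
    )
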
def remove_white_space(refined_res, p_text):
    """Collapse repeated whitespace and strip trailing spaces from a text column."""
    refined_res[p_text] = refined_res[p_text].str.split().str.join(" ")
    refined_res[p_text] = refined_res[p_text].str.rstrip()
    return refined_res


def gather_top_results(post_processed_results, org_name, number_of_results_per_org):
    """Take the top N projects per reporting organisation, preserving the order
    in which organisations first appear in the similarity-sorted results."""
    start_time = time.time()

    # Remove duplicate entries.
    post_processed_results = post_processed_results.drop_duplicates(
        subset=[org_name, TITLE_COLUMN_NAME, DESCRIPTION_COLUMN_NAME]
    )

    # Rank reporting organisations by first appearance in the results.
    myorder = post_processed_results[org_name].unique()
    sorter_index = dict(zip(myorder, range(len(myorder))))
    post_processed_results["myorder"] = post_processed_results[org_name].map(
        sorter_index
    )

    # Group entries by reporting organisation, taking the top entries per group.
    top_project_results = post_processed_results.groupby("myorder").head(
        number_of_results_per_org
    )

    # Order by top organisation and, within each organisation, by top projects.
    top_project_results = top_project_results.sort_values(
        ["myorder", "cosine_sim"], ascending=[True, False]
    )
    top_project_results = top_project_results.drop(["myorder"], axis=1)
    print("limited after {} seconds".format(time.time() - start_time))
    return top_project_results


if __name__ == "__main__":
    full_df = pd.read_csv(join(get_data_path(), INPUT_DATA_FILENAME), encoding="utf-8")
    cosine_res_df = pd.read_csv(
        join(get_data_path(), COSINE_FILENAME), encoding="utf-8"
    )

    refined_res = process_results(cosine_res_df, full_df, 100)
    refined_res = preprocessing_initial_text_clean(refined_res, ORG_ID_COLUMN_NAME)
    refined_res = remove_white_space(refined_res, ORG_ID_COLUMN_NAME)
    refined_res = remove_white_space(refined_res, DESCRIPTION_COLUMN_NAME)

    # Top results per reporting organisation.
    top_project_results = gather_top_results(refined_res, ORG_ID_COLUMN_NAME, 3)
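
# Illustrative sketch (not part of the original module): the
# first-appearance ordering trick used by gather_top_results, on toy data.
# Column names "org" and "cosine_sim" are for the demo only.
def _gather_top_results_demo():
    df = pd.DataFrame(
        {
            "org": ["X", "X", "Y", "X", "Y"],
            "cosine_sim": [0.9, 0.8, 0.7, 0.6, 0.5],
        }
    )
    # Rank each organisation by where it first appears (X -> 0, Y -> 1).
    order = {org: rank for rank, org in enumerate(df["org"].unique())}
    df["rank"] = df["org"].map(order)
    # Keep at most 2 rows per organisation, then sort so organisations stay
    # in first-appearance order with their best matches first.
    top = df.groupby("rank").head(2)
    top = top.sort_values(["rank", "cosine_sim"], ascending=[True, False])
    return top.drop(columns=["rank"])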