Source code for ips_python.cosine

import time
import pickle
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from os.path import join

try:
    from ips_python.utils import get_data_path
    from ips_python.preprocessing import preprocess_query_text
    from ips_python.vectorize import vectorize_input_text
    from ips_python.constants import (
        PROCESSED_RECORDS_FILENAME,
        TERM_DOCUMENT_MATRIX_FILENAME,
        VECTORIZER_FILENAME,
        COSINE_FILENAME,
        IATI_IDENTIFIER_COLUMN_NAME,
    )
except ModuleNotFoundError:
    from utils import get_data_path
    from preprocessing import preprocess_query_text
    from vectorize import vectorize_input_text
    from constants import (
        PROCESSED_RECORDS_FILENAME,
        TERM_DOCUMENT_MATRIX_FILENAME,
        VECTORIZER_FILENAME,
        COSINE_FILENAME,
        IATI_IDENTIFIER_COLUMN_NAME,
    )


[docs]def get_cosine_similarity(
    processed_user_query_vector, term_document_matrix, iati_records
):
    """
    input:
        TDM
        IATI Records used in TDM
        vectorized query

    output:
        cosine similarity > 0 per iati.identifier
    """

    cosine_array = cosine_similarity(term_document_matrix, processed_user_query_vector)

    iati_records = iati_records[[IATI_IDENTIFIER_COLUMN_NAME]]

    iati_records["cosine_sim"] = cosine_array

    # Remove all non-zero results?
    iati_records = iati_records[iati_records["cosine_sim"] > 0]

    return iati_records


if __name__ == "__main__":

    # Test query
    query = """Despite impressive improvements in Vietnam's development and
    health status over the past decade, gains have not been equitable and significant unmet
    health needs remain. Poor and marginalized populations continue to disproportionally
    suffer from preventable illnesses while those in wealthier socioeconomic groups
    continue to enjoy greater health and longer life expectancy. Social Marketing for
    Improved Rural Health will include 3 main components: i) social marketing of SafeWat
    household water treatment solution and promotion of safer hygiene behaviors; ii) Good
    health, Great life and iii) behavior change communication to address non-supply side
    barriers to healthier behaviors."""

    # Or uncomment below if wish to test input text at runtime
    # query = input("Please enter search text:\n")

    query_df = preprocess_query_text(query)

    if not query_df.empty:
        with open(join(get_data_path(), VECTORIZER_FILENAME), "rb") as _file:
            vectorizer = pickle.load(_file)
        query_vector = vectorize_input_text(query_df, vectorizer)

        with open(join(get_data_path(), TERM_DOCUMENT_MATRIX_FILENAME), "rb") as _file:
            term_document_matrix = pickle.load(_file)

        iati_records = pd.read_csv(
            join(get_data_path(), PROCESSED_RECORDS_FILENAME), encoding="utf-8"
        )

        iati_records = iati_records[[IATI_IDENTIFIER_COLUMN_NAME]]
        start_time = time.time()
        outDF = get_cosine_similarity(query_vector, term_document_matrix, iati_records)
        print("cosine match in {0} seconds".format(time.time() - start_time))
        # example calling of function for script
        # cosine_similar("tdm.pkl", "vec.pkl", "iati_records", "")

        outDF.to_csv(
            join(get_data_path(), COSINE_FILENAME), index=False, encoding="utf-8"
        )
    else:
        print("no words exist after pre-processing")