Source code for ips_python.cosine

import time
import pickle
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from os.path import join

try:
    from ips_python.utils import get_data_path
    from ips_python.preprocessing import preprocess_query_text
    from ips_python.vectorize import vectorize_input_text
    from ips_python.constants import (
        PROCESSED_RECORDS_FILENAME,
        TERM_DOCUMENT_MATRIX_FILENAME,
        VECTORIZER_FILENAME,
        COSINE_FILENAME,
        IATI_IDENTIFIER_COLUMN_NAME,
    )
except ModuleNotFoundError:
    from utils import get_data_path
    from preprocessing import preprocess_query_text
    from vectorize import vectorize_input_text
    from constants import (
        PROCESSED_RECORDS_FILENAME,
        TERM_DOCUMENT_MATRIX_FILENAME,
        VECTORIZER_FILENAME,
        COSINE_FILENAME,
        IATI_IDENTIFIER_COLUMN_NAME,
    )


def get_cosine_similarity(
    processed_user_query_vector, term_document_matrix, iati_records
):
    """Score every IATI record against the query and keep the positive matches.

    Args:
        processed_user_query_vector: Vectorized query, passed as the second
            argument to ``cosine_similarity``. It is expected to hold a
            single query row so that there is exactly one score per record.
        term_document_matrix: Term-document matrix, passed as the first
            argument to ``cosine_similarity``. Its rows are assumed to line
            up one-to-one with the rows of ``iati_records`` (the records the
            matrix was built from).
        iati_records: DataFrame containing at least the
            ``IATI_IDENTIFIER_COLUMN_NAME`` column. It is not modified.

    Returns:
        A new DataFrame holding the identifier column plus a ``cosine_sim``
        column, restricted to rows whose similarity is greater than zero.
        The index labels of the surviving rows are kept as they were.
    """
    cosine_array = cosine_similarity(term_document_matrix, processed_user_query_vector)

    # Take an explicit copy before adding a column: assigning into a bare
    # column selection is the pattern that raises pandas'
    # SettingWithCopyWarning, and a copy guarantees the caller's frame is
    # never touched.
    scored = iati_records[[IATI_IDENTIFIER_COLUMN_NAME]].copy()

    # cosine_similarity returns a 2-D (rows of X, rows of Y) array; flatten
    # the single query column so one scalar score is stored per record.
    scored["cosine_sim"] = cosine_array.ravel()

    # Keep only records with a strictly positive similarity to the query.
    return scored[scored["cosine_sim"] > 0]
if __name__ == "__main__":
    # Sample query used for a manual end-to-end run of the cosine matcher.
    query = (
        "Despite impressive improvements in Vietnam's development and health "
        "status over the past decade, gains have not been equitable and "
        "significant unmet health needs remain. Poor and marginalized "
        "populations continue to disproportionally suffer from preventable "
        "illnesses while those in wealthier socioeconomic groups continue to "
        "enjoy greater health and longer life expectancy. Social Marketing "
        "for Improved Rural Health will include 3 main components: i) social "
        "marketing of SafeWat household water treatment solution and "
        "promotion of safer hygiene behaviors; ii) Good health, Great life "
        "and iii) behavior change communication to address non-supply side "
        "barriers to healthier behaviors."
    )

    # To type the search text at runtime instead, uncomment the line below.
    # query = input("Please enter search text:\n")

    query_df = preprocess_query_text(query)

    if query_df.empty:
        # Pre-processing left nothing to vectorize, so there is nothing to match.
        print("no words exist after pre-processing")
    else:
        # Load the fitted vectorizer and turn the query into a vector.
        vectorizer_path = join(get_data_path(), VECTORIZER_FILENAME)
        with open(vectorizer_path, "rb") as _file:
            vectorizer = pickle.load(_file)
        query_vector = vectorize_input_text(query_df, vectorizer)

        # Load the stored term-document matrix.
        tdm_path = join(get_data_path(), TERM_DOCUMENT_MATRIX_FILENAME)
        with open(tdm_path, "rb") as _file:
            term_document_matrix = pickle.load(_file)

        # Read the processed records and keep only the identifier column.
        records_path = join(get_data_path(), PROCESSED_RECORDS_FILENAME)
        iati_records = pd.read_csv(records_path, encoding="utf-8")
        iati_records = iati_records[[IATI_IDENTIFIER_COLUMN_NAME]]

        # Time only the similarity computation itself.
        start_time = time.time()
        outDF = get_cosine_similarity(query_vector, term_document_matrix, iati_records)
        elapsed = time.time() - start_time
        print("cosine match in {0} seconds".format(elapsed))

        # Write the matches next to the other data files.
        output_path = join(get_data_path(), COSINE_FILENAME)
        outDF.to_csv(output_path, index=False, encoding="utf-8")