try:
from ips_python.constants import DESCRIPTION_COLUMN_NAME, ORG_ID_COLUMN_NAME
from ips_python.preprocessing import preprocess_query_text
from ips_python.vectorize import vectorize_input_text
from ips_python.cosine import get_cosine_similarity
from ips_python.refinement import process_results, gather_top_results
from ips_python.word2vecaverage import average_per_doc
except ModuleNotFoundError:
from constants import DESCRIPTION_COLUMN_NAME, ORG_ID_COLUMN_NAME
from preprocessing import preprocess_query_text
from vectorize import vectorize_input_text
from cosine import get_cosine_similarity
from refinement import process_results, gather_top_results
from word2vecaverage import average_per_doc
def download_data():
    """
    Placeholder for the data-procurement step.

    This function intentionally does nothing and returns ``None``. It exists
    only to show that something needs to be run in order to procure the data.
    """
def process_query(
    query_text,
    vectorizer,
    term_document_matrix,
    processed_iati_records,
    full_iati_records,
):
    """
    Run a text query through the vectorizer-based similarity pipeline.

    The query is preprocessed, vectorized with ``vectorizer``, compared
    against ``term_document_matrix`` via ``get_cosine_similarity``, refined
    with ``process_results`` and finally reduced by ``gather_top_results``.

    :param query_text: raw query text handed to ``preprocess_query_text``.
    :param vectorizer: vectorizer object handed to ``vectorize_input_text``.
    :param term_document_matrix: matrix the vectorized query is compared with.
    :param processed_iati_records: records passed alongside the matrix to
        ``get_cosine_similarity``.
    :param full_iati_records: records passed to ``process_results``.
    :return: whatever ``gather_top_results`` returns for the refined results.
    """
    query_frame = preprocess_query_text(query_text)
    query_vector = vectorize_input_text(query_frame, vectorizer)
    similarities = get_cosine_similarity(
        query_vector, term_document_matrix, processed_iati_records
    )
    refined = process_results(similarities, full_iati_records)
    # NOTE(review): the literal 3 is presumably the number of results kept
    # per ORG_ID_COLUMN_NAME value -- confirm against refinement module.
    return gather_top_results(refined, ORG_ID_COLUMN_NAME, 3)
def process_query_embeddings(
    query_text, w2v_model, w2v_avg, processed_iati_records, full_iati_records
):
    """
    Run a text query through the embedding-average similarity pipeline.

    The query is preprocessed, the entry labelled ``0`` of its
    ``DESCRIPTION_COLUMN_NAME`` column is averaged with ``average_per_doc``,
    and the reshaped average is compared against ``w2v_avg`` via
    ``get_cosine_similarity``. The result is then refined with
    ``process_results`` and reduced by ``gather_top_results``.

    :param query_text: raw query text handed to ``preprocess_query_text``.
    :param w2v_model: model object handed to ``average_per_doc``.
    :param w2v_avg: averages the query average is compared with.
    :param processed_iati_records: records passed alongside ``w2v_avg`` to
        ``get_cosine_similarity``.
    :param full_iati_records: records passed to ``process_results``.
    :return: whatever ``gather_top_results`` returns for the refined results.
    """
    query_frame = preprocess_query_text(query_text)
    description = str(query_frame[DESCRIPTION_COLUMN_NAME][0])
    # NOTE(review): 300 is presumably the embedding dimensionality of
    # w2v_model -- confirm against word2vecaverage.average_per_doc.
    averaged = average_per_doc(description, w2v_model, 300)
    # Reshape to a single row so it is passed as a 2-D (1, n) array.
    query_vector = averaged.reshape(1, -1)
    similarities = get_cosine_similarity(
        query_vector, w2v_avg, processed_iati_records
    )
    refined = process_results(similarities, full_iati_records)
    # NOTE(review): the literal 3 is presumably the number of results kept
    # per ORG_ID_COLUMN_NAME value -- confirm against refinement module.
    return gather_top_results(refined, ORG_ID_COLUMN_NAME, 3)