# Source code for ips_python.vectorize

import pickle
from os.path import join
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

try:
    from ips_python.utils import get_data_path
    from ips_python.constants import (
        PROCESSED_RECORDS_FILENAME,
        WORD_LIST_FILENAME,
        TERM_DOCUMENT_MATRIX_FILENAME,
        VECTORIZER_FILENAME,
        DESCRIPTION_COLUMN_NAME,
        IATI_IDENTIFIER_COLUMN_NAME,
    )
except ModuleNotFoundError:
    from utils import get_data_path
    from constants import (
        PROCESSED_RECORDS_FILENAME,
        WORD_LIST_FILENAME,
        TERM_DOCUMENT_MATRIX_FILENAME,
        VECTORIZER_FILENAME,
        DESCRIPTION_COLUMN_NAME,
        IATI_IDENTIFIER_COLUMN_NAME,
    )


def create_tfidf_term_document_matrix(preprocessed_text_dataframe):
    """
    fit a TFIDF vectorizer on the description column and return the
    vectorizer object, TFIDF term document matrix and list of words

    input:
        preprocessed_text_dataframe: dataframe of preprocessed text holding
            the DESCRIPTION_COLUMN_NAME column

    output:
        tuple: vectorizer, term_document_matrix, word_list
            vectorizer: the fitted TfidfVectorizer
            term_document_matrix: result of fit_transform on the descriptions
            word_list: plain list of the terms reported by the vectorizer
    """
    # min_df=1 keeps every term, just like the former integer min_df=0 (each
    # vocabulary term occurs in at least one document), but it also passes the
    # parameter validation of newer scikit-learn releases, which require an
    # integer min_df to be >= 1.
    vectorizer = TfidfVectorizer(min_df=1)
    term_document_matrix = vectorizer.fit_transform(
        preprocessed_text_dataframe[DESCRIPTION_COLUMN_NAME]
    )

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old installations. tolist() keeps the
    # return value a plain list of str, as callers received before.
    if hasattr(vectorizer, "get_feature_names_out"):
        word_list = vectorizer.get_feature_names_out().tolist()
    else:
        word_list = vectorizer.get_feature_names()

    return (vectorizer, term_document_matrix, word_list)


def write_tfidf_term_document_matrix_to_file(
    preprocessed_file_name,
    word_list_file_name,
    term_document_matrix_filename,
    vectorizer_filename,
):
    """
    build the TFIDF artifacts from a preprocessed CSV and pickle them

    All file names are resolved relative to the directory returned by
    get_data_path().

    input:
        preprocessed_file_name: CSV file to read the preprocessed records from
        word_list_file_name: pickle file that receives the word list
        term_document_matrix_filename: pickle file that receives the term
            document matrix
        vectorizer_filename: pickle file that receives the vectorizer

    output:
        None, the three pickle files are written as a side effect
    """
    records_path = join(get_data_path(), preprocessed_file_name)
    records = pd.read_csv(records_path, encoding="utf-8")

    # only the identifier and description columns are kept
    wanted_columns = [IATI_IDENTIFIER_COLUMN_NAME, DESCRIPTION_COLUMN_NAME]
    records = records[wanted_columns]

    fitted_vectorizer, matrix, vocabulary = create_tfidf_term_document_matrix(
        records
    )

    # (target file name, object to pickle), written in this order
    artifacts = (
        (term_document_matrix_filename, matrix),
        (word_list_file_name, vocabulary),
        (vectorizer_filename, fitted_vectorizer),
    )
    for target_name, payload in artifacts:
        with open(join(get_data_path(), target_name), "wb") as handle:
            pickle.dump(payload, handle)


def vectorize_input_text(processed_query_dataframe, vectorizer):
    """
    vectorize processed user text with an already fitted vectorizer

    input:
        processed_query_dataframe: dataframe of processed user text holding
            the DESCRIPTION_COLUMN_NAME column
        vectorizer: fitted TfidfVectorizer object

    output:
        whatever vectorizer.transform returns for the description column
        (for scikit-learn's TfidfVectorizer this is a sparse matrix rather
        than a dense numpy array)
    """
    # transform (not fit_transform): the vectorizer is applied as passed in
    # and is not refitted on the query text
    return vectorizer.transform(processed_query_dataframe[DESCRIPTION_COLUMN_NAME])


if __name__ == "__main__":
    # Run as a script: build the TFIDF artifacts using the file names
    # configured in the constants module.
    write_tfidf_term_document_matrix_to_file(
        preprocessed_file_name=PROCESSED_RECORDS_FILENAME,
        word_list_file_name=WORD_LIST_FILENAME,
        term_document_matrix_filename=TERM_DOCUMENT_MATRIX_FILENAME,
        vectorizer_filename=VECTORIZER_FILENAME,
    )