Source code for ips_python.vectorize

import pickle
from os.path import join
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

try:
    from ips_python.utils import get_data_path
    from ips_python.constants import (
        PROCESSED_RECORDS_FILENAME,
        WORD_LIST_FILENAME,
        TERM_DOCUMENT_MATRIX_FILENAME,
        VECTORIZER_FILENAME,
        DESCRIPTION_COLUMN_NAME,
        IATI_IDENTIFIER_COLUMN_NAME,
    )
except ModuleNotFoundError:
    from utils import get_data_path
    from constants import (
        PROCESSED_RECORDS_FILENAME,
        WORD_LIST_FILENAME,
        TERM_DOCUMENT_MATRIX_FILENAME,
        VECTORIZER_FILENAME,
        DESCRIPTION_COLUMN_NAME,
        IATI_IDENTIFIER_COLUMN_NAME,
    )


def create_tfidf_term_document_matrix(preprocessed_text_dataframe):
    """
    fit a TFIDF vectorizer on the preprocessed text and return the
    vectorizer object, TFIDF term document matrix and list of words

    input:
        preprocessed_text_dataframe: dataframe of preprocessed text; the
            column named by DESCRIPTION_COLUMN_NAME is used as the corpus
    output:
        tuple: vectorizer, term_document_matrix, word_list
            vectorizer: the fitted TfidfVectorizer
            term_document_matrix: result of fit_transform on the corpus
            word_list: list of vocabulary terms, one per matrix column
    """
    # min_df=1 keeps every vocabulary term (each term occurs in at least one
    # document), and unlike an integer 0 it is accepted by scikit-learn
    # releases that validate constructor parameters.
    vectorizer = TfidfVectorizer(min_df=1)
    term_document_matrix = vectorizer.fit_transform(
        preprocessed_text_dataframe[DESCRIPTION_COLUMN_NAME]
    )
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        # convert the returned array to a plain list so callers (and the
        # pickled word list) keep receiving a list of strings
        word_list = vectorizer.get_feature_names_out().tolist()
    else:
        word_list = vectorizer.get_feature_names()
    return (vectorizer, term_document_matrix, word_list)
def write_tfidf_term_document_matrix_to_file(
    preprocessed_file_name,
    word_list_file_name,
    term_document_matrix_filename,
    vectorizer_filename,
):
    """
    build the TFIDF artefacts from a preprocessed csv file and pickle them

    All file names are resolved relative to the directory returned by
    get_data_path().

    input:
        preprocessed_file_name: csv file holding the preprocessed records
        word_list_file_name: pickle file to receive the word list
        term_document_matrix_filename: pickle file to receive the matrix
        vectorizer_filename: pickle file to receive the vectorizer
    output:
        None, the three pickle files are written as a side effect
    """
    records_path = join(get_data_path(), preprocessed_file_name)
    records = pd.read_csv(records_path, encoding="utf-8")

    # only the identifier and description columns are carried forward
    columns_to_keep = [IATI_IDENTIFIER_COLUMN_NAME, DESCRIPTION_COLUMN_NAME]
    records = records[columns_to_keep]

    fitted = create_tfidf_term_document_matrix(records)
    vectorizer, term_document_matrix, word_list = fitted

    # written in this order: matrix, word list, vectorizer
    artefacts = (
        (term_document_matrix_filename, term_document_matrix),
        (word_list_file_name, word_list),
        (vectorizer_filename, vectorizer),
    )
    for target_name, payload in artefacts:
        with open(join(get_data_path(), target_name), "wb") as output_file:
            pickle.dump(payload, output_file)
def vectorize_input_text(processed_query_dataframe, vectorizer):
    """
    vectorize processed user text with an already fitted vectorizer

    input:
        processed_query_dataframe: dataframe of processed user text; the
            column named by DESCRIPTION_COLUMN_NAME is vectorized
        vectorizer: fitted TfidfVectorizer object
    output:
        whatever vectorizer.transform returns for the query text (for a
        TfidfVectorizer this is a sparse document-term matrix)
    """
    query_text = processed_query_dataframe[DESCRIPTION_COLUMN_NAME]
    # transform only: the vocabulary learned at fit time is reused as is
    vectorized_query = vectorizer.transform(query_text)
    return vectorized_query
if __name__ == "__main__":
    # Run as a script: build the TFIDF artefacts from the processed records
    # file and write them to the file names configured in constants.
    write_tfidf_term_document_matrix_to_file(
        preprocessed_file_name=PROCESSED_RECORDS_FILENAME,
        word_list_file_name=WORD_LIST_FILENAME,
        term_document_matrix_filename=TERM_DOCUMENT_MATRIX_FILENAME,
        vectorizer_filename=VECTORIZER_FILENAME,
    )