Source code for ips_python.preprocessing

import pandas as pd
import nltk
from os.path import join
from nltk.corpus import stopwords, wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from langdetect import detect
import time

try:
    from ips_python.utils import get_data_path, get_input_path
    from ips_python.constants import (
        PROCESSED_RECORDS_FILENAME,
        INPUT_DATA_FILENAME,
        STOPWORDS_FILENAME,
        KEEPWORDS_FILENAME,
        DESCRIPTION_COLUMN_NAME,
        TITLE_COLUMN_NAME,
        IATI_IDENTIFIER_COLUMN_NAME,
    )
except ModuleNotFoundError:
    from utils import get_data_path, get_input_path
    from constants import (
        PROCESSED_RECORDS_FILENAME,
        INPUT_DATA_FILENAME,
        STOPWORDS_FILENAME,
        KEEPWORDS_FILENAME,
        DESCRIPTION_COLUMN_NAME,
        TITLE_COLUMN_NAME,
        IATI_IDENTIFIER_COLUMN_NAME,
    )
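
# Note: the NLTK resources used below are downloaded separately, e.g.
#   nltk.download("words"), nltk.download("stopwords"),
#   nltk.download("wordnet"), nltk.download("averaged_perceptron_tagger")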


def preprocessing_initial_text_clean(p_df, p_text):
    # remove rows where the text column is NA
    p_df = p_df.dropna(subset=[p_text])
    # convert all values to string (NaN in other columns becomes the string "nan")
    p_df = p_df.astype(str)
    # remove punctuation
    p_df[p_text] = p_df[p_text].str.replace(r"[^\w\s]", "", regex=True)
    # remove underscores, which \w does not treat as punctuation above
    p_df[p_text] = p_df[p_text].str.replace("_", " ", regex=False)
    # remove numbers
    p_df[p_text] = p_df[p_text].str.replace(r"\d+", "", regex=True)
    # lowercase and collapse repeated whitespace
    p_df[p_text] = p_df[p_text].apply(
        lambda x: " ".join(word.lower() for word in x.split())
    )
    return p_df
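
# Example (illustrative):
#   df = pd.DataFrame({"description": ["Hello, World_ 123!"]})
#   preprocessing_initial_text_clean(df, "description")
#   # the description column becomes "hello world"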


def preprocessing_nonenglish_paragraph_remove(p_df, p_text):
    # keep English-language records only
    for index, row in p_df.iterrows():
        try:
            if detect(row[p_text]) != "en":
                p_df = p_df.drop(index)
        except Exception:
            # detect() raises on empty or undetectable text; drop those rows too
            p_df = p_df.drop(index)
    return p_df
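
# Note: langdetect is non-deterministic by default; for reproducible results
# one could seed it once at import time (illustrative, not done in this module):
#   from langdetect import DetectorFactory
#   DetectorFactory.seed = 0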


def preprocessing_nonenglish_words_remove(p_df, p_text):
    # build the keep list: the NLTK English word corpus plus custom keep words
    wordstokeep = nltk.corpus.words.words()
    wordstokeep = append_to_list(
        wordstokeep, join(get_input_path(), KEEPWORDS_FILENAME)
    )
    wordstokeep = [w.lower() for w in wordstokeep]
    wordstokeep = split_flatten_list(wordstokeep)
    wordstokeep = set(wordstokeep)
    # remove any word not in the keep list
    p_df[p_text] = p_df[p_text].apply(
        lambda x: " ".join(word for word in x.split() if word in wordstokeep)
    )
    return p_df
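
# Example (illustrative): assuming "zzzxq" appears in neither the NLTK word
# corpus nor the keep-words file, a description "water zzzxq sanitation" is
# reduced to "water sanitation".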


def append_to_list(inlist, inputfile):
    # read one word per line from inputfile, lowercase, and append to inlist
    with open(inputfile, "r") as r:
        new_words = r.read().splitlines()
        new_words = [w.lower() for w in new_words]
    return inlist + new_words


def split_flatten_list(inputlist):
    # split multi-word entries and flatten into a single list of words
    splitlist = [s.split(" ") for s in inputlist]
    return [word for sublist in splitlist for word in sublist]
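
# Example (illustrative): split_flatten_list(["clean water", "health"])
# returns ["clean", "water", "health"].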


def preprocessing_stopwords_remove(p_df, p_text):
    # remove English stop words, plus custom stop words from file
    stop = stopwords.words("english")
    stop = append_to_list(stop, join(get_input_path(), STOPWORDS_FILENAME))
    # set membership is much faster than list membership for the lookup below
    stop = set(stop)
    p_df[p_text] = p_df[p_text].apply(
        lambda x: " ".join(word for word in x.split() if word not in stop)
    )
    return p_df
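
# Example (illustrative): with NLTK's English stop list, "access to clean
# water" becomes "access clean water" ("to" is a stop word).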


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return tag_dict.get(tag, wordnet.NOUN)
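
# Example (illustrative): nltk.pos_tag(["running"]) typically tags the token
# as VBG, so get_wordnet_pos("running") returns wordnet.VERB, and
# WordNetLemmatizer().lemmatize("running", wordnet.VERB) yields "run".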


def preprocessing_lemmatise(p_df, p_text):
    lemmatizer = WordNetLemmatizer()
    p_df[p_text] = p_df[p_text].apply(
        lambda x: " ".join(
            [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in x.split()]
        )
    )
    return p_df


def preprocessing_stem(p_df, p_text):
    st = PorterStemmer()
    p_df[p_text] = p_df[p_text].apply(
        lambda x: " ".join([st.stem(word) for word in x.split()])
    )
    return p_df


def preprocessing_empty_text_remove(p_df, p_text):
    # remove NA values
    p_df = p_df[~p_df[p_text].isna()]
    # remove empty strings
    p_df = p_df[p_df[p_text] != ""]
    # remove entirely whitespace strings in the text column
    p_df = p_df[~p_df[p_text].str.isspace()]
    # remove NaN stored as the string "nan"
    p_df = p_df[p_df[p_text] != "nan"]
    return p_df


def preprocess_query_text(query_text):
    # transform the query into a single-row dataframe
    df = pd.DataFrame([query_text], columns=[DESCRIPTION_COLUMN_NAME])
    # apply the same preprocessing steps as the main pipeline
    df = preprocessing_initial_text_clean(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_nonenglish_words_remove(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_stopwords_remove(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_stem(df, DESCRIPTION_COLUMN_NAME)
    return preprocessing_empty_text_remove(df, DESCRIPTION_COLUMN_NAME)
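
# Example (illustrative, assuming the NLTK corpora are installed):
#   preprocess_query_text("Clean water!")
# returns a one-row dataframe whose description column holds "clean water".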


def preprocess_pipeline(df):
    """
    Default process for taking the raw IATI data dump and processing the text
    for vectorizing

    Args:
        df: dataframe of the raw IATI data with columns including identifier,
            description and title

    Returns:
        dataframe of preprocessed data with _only_ the columns
        IATI_IDENTIFIER_COLUMN_NAME and DESCRIPTION_COLUMN_NAME
    """
    df = df[[IATI_IDENTIFIER_COLUMN_NAME, DESCRIPTION_COLUMN_NAME, TITLE_COLUMN_NAME]]
    # remove records in the current full dataset with a null iati.identifier
    df = df[~df[IATI_IDENTIFIER_COLUMN_NAME].str.isspace()]
    # if both description and title are not NA, concatenate them into the
    # description column
    df.loc[
        ~df[DESCRIPTION_COLUMN_NAME].isna() & ~df[TITLE_COLUMN_NAME].isna(),
        [DESCRIPTION_COLUMN_NAME],
    ] = (df[TITLE_COLUMN_NAME] + " " + df[DESCRIPTION_COLUMN_NAME])
    # if description is NA, replace it with title
    df.loc[df[DESCRIPTION_COLUMN_NAME].isna(), [DESCRIPTION_COLUMN_NAME]] = df[
        TITLE_COLUMN_NAME
    ]
    df = df[[IATI_IDENTIFIER_COLUMN_NAME, DESCRIPTION_COLUMN_NAME]]
    # preprocessing steps
    df = preprocessing_initial_text_clean(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_nonenglish_words_remove(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_stopwords_remove(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_stem(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_empty_text_remove(df, DESCRIPTION_COLUMN_NAME)
    return df
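
# Example (illustrative): a record with title "Water project" and description
# "Borehole drilling" enters the text-cleaning steps as
# "Water project Borehole drilling"; a record with a missing description
# enters with its title alone.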


if __name__ == "__main__":
    start = time.time()
    # import the full dataset
    df = pd.read_csv(join(get_data_path(), INPUT_DATA_FILENAME), encoding="utf-8")
    df = preprocess_pipeline(df)
    # write out the dataframe with reduced records
    df.to_csv(
        join(get_data_path(), PROCESSED_RECORDS_FILENAME),
        index=False,
        encoding="utf-8",
    )
    end = time.time()
    print("completed in {0} seconds".format(end - start))