import pandas as pd
import nltk
from os.path import join
from nltk.corpus import stopwords, wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from langdetect import detect
import time
try:
from ips_python.utils import get_data_path, get_input_path
from ips_python.constants import (
PROCESSED_RECORDS_FILENAME,
INPUT_DATA_FILENAME,
STOPWORDS_FILENAME,
KEEPWORDS_FILENAME,
DESCRIPTION_COLUMN_NAME,
TITLE_COLUMN_NAME,
IATI_IDENTIFIER_COLUMN_NAME,
)
except ModuleNotFoundError:
from utils import get_data_path, get_input_path
from constants import (
PROCESSED_RECORDS_FILENAME,
INPUT_DATA_FILENAME,
STOPWORDS_FILENAME,
KEEPWORDS_FILENAME,
DESCRIPTION_COLUMN_NAME,
TITLE_COLUMN_NAME,
IATI_IDENTIFIER_COLUMN_NAME,
)
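

# All of the NLTK corpora used below (stopwords, wordnet, the English word
# list, and the tagger behind nltk.pos_tag) must be downloaded once per
# environment. This helper is a convenience sketch, not part of the original
# pipeline; newer NLTK releases may also need "omw-1.4" for the lemmatizer.
def download_nltk_resources():
    # nltk.download is a no-op when the resource is already installed
    for resource in ("stopwords", "wordnet", "words", "averaged_perceptron_tagger"):
        nltk.download(resource)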
def preprocessing_initial_text_clean(p_df, p_text):
    # remove rows with NA in the text column
    p_df = p_df.dropna(subset=[p_text])
    # convert the text column to string
    p_df[p_text] = p_df[p_text].astype(str)
    # remove punctuation (regex=True must be explicit; recent pandas defaults to literal matching)
    p_df[p_text] = p_df[p_text].str.replace(r"[^\w\s]", "", regex=True)
    # replace underscores, which count as \w and survive the step above
    p_df[p_text] = p_df[p_text].str.replace("_", " ", regex=False)
    # remove numbers
    p_df[p_text] = p_df[p_text].str.replace(r"\d+", "", regex=True)
    # lowercase and normalise whitespace
    p_df[p_text] = p_df[p_text].apply(
        lambda x: " ".join(w.lower() for w in x.split())
    )
    return p_df
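
# Worked example (hypothetical data): punctuation and digits are stripped,
# underscores become spaces, and the result is lowercased with collapsed
# whitespace:
#   df = pd.DataFrame({"description": ["Built 3 new_schools in 2020!"]})
#   preprocessing_initial_text_clean(df, "description")
#   # description -> "built new schools in"
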
def preprocessing_nonenglish_paragraph_remove(p_df, p_text):
    # keep only rows whose text langdetect identifies as English
    for index, row in p_df.iterrows():
        try:
            if detect(row[p_text]) != "en":
                p_df = p_df.drop(index)
        except Exception:
            # detect() raises on empty or undetectable text; drop those rows too
            p_df = p_df.drop(index)
    return p_df
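
# Note: langdetect is non-deterministic on short or ambiguous text. If
# reproducible runs matter, the library's documented seed can be pinned once
# at import time:
#   from langdetect import DetectorFactory
#   DetectorFactory.seed = 0
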
def preprocessing_nonenglish_words_remove(p_df, p_text):
    # start from the NLTK English word list, plus project-specific keep words
    wordstokeep = nltk.corpus.words.words()
    wordstokeep = append_to_list(
        wordstokeep, join(get_input_path(), KEEPWORDS_FILENAME)
    )
    wordstokeep = [w.lower() for w in wordstokeep]
    wordstokeep = split_flatten_list(wordstokeep)
    # a set makes the per-token membership test O(1)
    wordstokeep = set(wordstokeep)
    # drop every token that is not in the keep list
    p_df[p_text] = p_df[p_text].apply(
        lambda x: " ".join(w for w in x.split() if w in wordstokeep)
    )
    return p_df
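
# This filter is deliberately aggressive: any token missing from both the
# NLTK word list and KEEPWORDS_FILENAME is discarded (including acronyms and
# proper nouns), so the keep-words file acts as the escape hatch for domain
# vocabulary.
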
def append_to_list(inlist, inputfile):
    # read one entry per line, lowercase it, and append to the given list
    with open(inputfile, "r", encoding="utf-8") as r:
        new_words = r.read().splitlines()
    new_words = [w.lower() for w in new_words]
    return inlist + new_words
def split_flatten_list(inputlist):
splitlist = [s.split(" ") for s in inputlist]
return [o for i in splitlist for o in i]
def preprocessing_stopwords_remove(p_df, p_text):
    # Remove English stop words plus project-specific stop words
    stop = stopwords.words("english")
    stop = set(append_to_list(stop, join(get_input_path(), STOPWORDS_FILENAME)))
    p_df[p_text] = p_df[p_text].apply(
        lambda x: " ".join(w for w in x.split() if w not in stop)
    )
    return p_df
def get_wordnet_pos(word):
"""Map POS tag to first character lemmatize() accepts"""
tag = nltk.pos_tag([word])[0][1][0].upper()
tag_dict = {
"J": wordnet.ADJ,
"N": wordnet.NOUN,
"V": wordnet.VERB,
"R": wordnet.ADV,
}
return tag_dict.get(tag, wordnet.NOUN)
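
# Illustrative example: "running" is typically tagged VBG, so it maps to
# wordnet.VERB and lemmatizes to its verb lemma:
#   get_wordnet_pos("running")                              # wordnet.VERB
#   WordNetLemmatizer().lemmatize("running", wordnet.VERB)  # "run"
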
def preprocessing_lemmatise(p_df, p_text):
    lemmatizer = WordNetLemmatizer()
    # lemmatize each word with its detected part of speech
    p_df[p_text] = p_df[p_text].apply(
        lambda x: " ".join(
            lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in x.split()
        )
    )
    return p_df
def preprocessing_stem(p_df, p_text):
st = PorterStemmer()
p_df[p_text] = p_df[p_text].apply(
lambda x: " ".join([st.stem(word) for word in x.split()])
)
return p_df
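
# The default pipelines below stem rather than lemmatise: the Porter stemmer
# needs no per-word POS tagging, so it is much faster, at the cost of
# producing non-words (e.g. "education" stems to "educ", while
# preprocessing_lemmatise would leave it intact).
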
def preprocessing_empty_text_remove(p_df, p_text):
    # Remove NA values
    p_df = p_df[~p_df[p_text].isna()]
    # Remove empty strings
    p_df = p_df[p_df[p_text] != ""]
    # Remove strings that are entirely whitespace
    p_df = p_df[~p_df[p_text].str.isspace()]
    # Remove NaN values stored as the string "nan"
    p_df = p_df[p_df[p_text] != "nan"]
    return p_df
def preprocess_query_text(query_text):
# transform into dataframe
df = pd.DataFrame([query_text], columns=[DESCRIPTION_COLUMN_NAME])
# Apply specific preprocessing functions
df = preprocessing_initial_text_clean(df, DESCRIPTION_COLUMN_NAME)
df = preprocessing_nonenglish_words_remove(df, DESCRIPTION_COLUMN_NAME)
df = preprocessing_stopwords_remove(df, DESCRIPTION_COLUMN_NAME)
df = preprocessing_stem(df, DESCRIPTION_COLUMN_NAME)
return preprocessing_empty_text_remove(df, DESCRIPTION_COLUMN_NAME)
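
# Example usage (hypothetical query). The returned dataframe is empty when
# every token is stripped out, so callers should check before indexing:
#   processed = preprocess_query_text("Improving access to clean water")
#   if not processed.empty:
#       cleaned = processed[DESCRIPTION_COLUMN_NAME].iloc[0]
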
def preprocess_pipeline(df):
    """
    Default process for taking the raw IATI data dump and processing the text for vectorizing

    Args:
        df: dataframe of the raw IATI data with columns including identifier, description and title

    Returns:
        dataframe with preprocessed data with _only_ the columns IATI_IDENTIFIER_COLUMN_NAME and DESCRIPTION_COLUMN_NAME
    """
    df = df[[IATI_IDENTIFIER_COLUMN_NAME, DESCRIPTION_COLUMN_NAME, TITLE_COLUMN_NAME]]
    # Remove records with a null or whitespace-only iati.identifier
    df = df[df[IATI_IDENTIFIER_COLUMN_NAME].notna()]
    df = df[~df[IATI_IDENTIFIER_COLUMN_NAME].str.isspace()]
    # If both description and title are present, prepend the title to the description
    df.loc[
        ~df[DESCRIPTION_COLUMN_NAME].isna() & ~df[TITLE_COLUMN_NAME].isna(),
        [DESCRIPTION_COLUMN_NAME],
    ] = (df[TITLE_COLUMN_NAME] + " " + df[DESCRIPTION_COLUMN_NAME])
    # If the description is NA, fall back to the title
    df.loc[df[DESCRIPTION_COLUMN_NAME].isna(), [DESCRIPTION_COLUMN_NAME]] = df[
        TITLE_COLUMN_NAME
    ]
df = df[[IATI_IDENTIFIER_COLUMN_NAME, DESCRIPTION_COLUMN_NAME]]
# preprocessing
df = preprocessing_initial_text_clean(df, DESCRIPTION_COLUMN_NAME)
df = preprocessing_nonenglish_words_remove(df, DESCRIPTION_COLUMN_NAME)
df = preprocessing_stopwords_remove(df, DESCRIPTION_COLUMN_NAME)
df = preprocessing_stem(df, DESCRIPTION_COLUMN_NAME)
df = preprocessing_empty_text_remove(df, DESCRIPTION_COLUMN_NAME)
return df
if __name__ == "__main__":
start = time.time()
    # load the full raw dataset
df = pd.read_csv(join(get_data_path(), INPUT_DATA_FILENAME), encoding="utf-8")
df = preprocess_pipeline(df)
# write out df with reduced records
df.to_csv(
        join(get_data_path(), PROCESSED_RECORDS_FILENAME),
index=False,
encoding="utf-8",
)
end = time.time()
print("completed in {0} seconds".format(end - start))