Word Count Analysis

This notebook compares the number of words per record before and after preprocessing, to show how the word loss caused by preprocessing is distributed across records.

[108]:

import pandas as pd
import matplotlib.pyplot as plt
from os.path import join
from constants import (
    PROCESSED_RECORDS_FILENAME,
    INPUT_DATA_FILENAME)
from utils import get_data_path
from preprocessing import preprocessing_initial_text_clean

[95]:

# Load the raw input records and the already pre-processed records.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference is what triggers the
# "Columns (...) have mixed types" DtypeWarning on import.
df = pd.read_csv(
    join(get_data_path(), INPUT_DATA_FILENAME),
    encoding="iso-8859-1",
    low_memory=False,
)
preprocessed_df = pd.read_csv(
    join(get_data_path(), PROCESSED_RECORDS_FILENAME),
    encoding="iso-8859-1",
    low_memory=False,
)

C:\Users\t-wilson\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3057: DtypeWarning: Columns (2,3,7,10,11,12,13,14,15,16,17,18,20,22,23,24,25,26,28,29,30,31,32,33,34,35,36,37,38,39,40,41,46,47,48,49,50,51,52,54,55) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

[96]:

# Only the record identifier and the two free-text fields are used below.
text_columns = ["iati.identifier", "description", "title"]
df = df[text_columns]

[97]:

# Drop records whose iati.identifier is missing or blank. Missing values are
# mapped to "" first because .str.isspace() returns NaN for them, and a mask
# containing NaN can neither be negated with ~ nor used for boolean indexing.
identifier = df["iati.identifier"].fillna("").astype(str)
df = df[identifier.str.strip().ne("")].copy()

# Build the text to analyse in the description column:
#   title and description both present -> "<title> <description>"
#   only one of them present           -> that one, unchanged
#   both missing                       -> stays NaN
title_and_description = df["title"] + " " + df["description"]  # NaN if either is missing
df["description"] = (
    title_and_description
    .fillna(df["description"])
    .fillna(df["title"])
)

[98]:

# Apply the project's initial text-cleaning pass to the description column
# (what exactly gets cleaned is defined in the preprocessing module).
df = preprocessing_initial_text_clean(df, "description")

[99]:

def count_words(text):
    """Return the number of whitespace-separated words in each string of `text`.

    Splitting on whitespace (rather than counting " " and adding 1) keeps
    repeated spaces, tabs/newlines and leading/trailing whitespace from
    inflating the count, and yields 0 instead of 1 for an empty string.
    Missing values stay missing.

    Parameters
    ----------
    text : pandas.Series of str

    Returns
    -------
    pandas.Series
        Word count per element, aligned with the index of `text`.
    """
    return text.str.split().str.len()


# Same counting rule for both frames so the distributions are comparable.
df['words'] = count_words(df['description'])
preprocessed_df['words'] = count_words(preprocessed_df['description'])

[100]:

# Keep only the raw records whose identifier also occurs in the
# pre-processed frame.
in_preprocessed = df["iati.identifier"].isin(preprocessed_df["iati.identifier"])
df = df[in_preprocessed]

[101]:

# Summary statistics of the per-record word count for the initial text.
initial_word_stats = df['words'].describe()
print("Initial DF word stats \n{0}".format(initial_word_stats))

Initial DF word stats
count    783035.000000
mean         48.899398
std          77.993783
min           1.000000
25%          14.000000
50%          26.000000
75%          49.000000
max       12135.000000
Name: words, dtype: float64

[102]:

# Summary statistics of the per-record word count for the pre-processed text.
preprocessed_word_stats = preprocessed_df['words'].describe()
print("Pre-processed DF word stats \n{0}".format(preprocessed_word_stats))

Pre-processed DF word stats
count    783035.000000
mean         12.717107
std          21.629170
min           1.000000
25%           2.000000
50%           6.000000
75%          15.000000
max        4110.000000
Name: words, dtype: float64

[123]:

# Overlay the two word-count distributions. Both histograms use the same bins
# and range so their bars line up; values outside the range are not drawn.
hist_options = dict(bins=100, range=[0, 200], alpha=0.5)

fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(df['words'], label='initial text', **hist_options)
ax.hist(preprocessed_df['words'], label='pre-processed text', **hist_options)
ax.set_title('Word count per record before and after pre-processing')
ax.set_xlabel('word count per record')
ax.set_ylabel('frequency')
ax.legend(prop={'size': 20})
plt.show()