Country | Country and parent code | Population Density | Year |
---|---|---|---|
Burundi | CC108_PC108 | 449.01 | 2019 |
Comoros | CC174_PC174 | 457.222 | 2019 |
Djibouti | CC262_PC262 | 41.9999 | 2019 |
Eritrea | CC232_PC232 | 34.6249 | 2019 |
Ethiopia | CC231_PC231 | 112.079 | 2019 |
Kenya | CC404_PC404 | 92.3744 | 2019 |
Madagascar | CC450_PC450 | 46.3553 | 2019 |
Malawi | CC454_PC454 | 197.59 | 2019 |
Mauritius | CC480_PC480 | 625.453 | 2019 |
Mayotte | CC175_PC175 | 709.741 | 2019 |
Mozambique | CC508_PC508 | 38.615 | 2019 |
Réunion | CC638_PC638 | 355.573 | 2019 |
Rwanda | CC646_PC646 | 511.834 | 2019 |
Seychelles | CC690_PC690 | 212.48 | 2019 |
Somalia | CC706_PC706 | 24.6165 | 2019 |
South Sudan | CC728_PC728 | 18.1064 | 2019 |
Uganda | CC800_PC800 | 221.558 | 2019 |
United Republic of Tanzania | CC834_PC834 | 65.4837 | 2019 |
Zambia | CC894_PC894 | 24.0265 | 2019 |
Zimbabwe | CC716_PC716 | 37.8583 | 2019 |
Angola | CC24_PC24 | 25.5276 | 2019 |
Cameroon | CC120_PC120 | 54.7405 | 2019 |
Central African Republic | CC140_PC140 | 7.6169 | 2019 |
Chad | CC148_PC148 | 12.6643 | 2019 |
Congo | CC178_PC178 | 15.7555 | 2019 |
Democratic Republic of the Congo | CC180_PC180 | 38.2835 | 2019 |
Equatorial Guinea | CC226_PC226 | 48.3416 | 2019 |
Gabon | CC266_PC266 | 8.43163 | 2019 |
Sao Tome and Principe | CC678_PC678 | 224.008 | 2019 |
Botswana | CC72_PC72 | 4.0649 | 2019 |
Eswatini | CC748_PC748 | 66.7519 | 2019 |
Lesotho | CC426_PC426 | 70.0022 | 2019 |
Namibia | CC516_PC516 | 3.02995 | 2019 |
South Africa | CC710_PC710 | 48.272 | 2019 |
Benin | CC204_PC204 | 104.657 | 2019 |
Burkina Faso | CC854_PC854 | 74.2741 | 2019 |
Cabo Verde | CC132_PC132 | 136.461 | 2019 |
Côte d’Ivoire | CC384_PC384 | 80.8697 | 2019 |
Gambia | CC270_PC270 | 231.986 | 2019 |
Ghana | CC288_PC288 | 133.681 | 2019 |
Guinea | CC324_PC324 | 51.9748 | 2019 |
Guinea-Bissau | CC624_PC624 | 68.3114 | 2019 |
Liberia | CC430_PC430 | 51.2601 | 2019 |
Mali | CC466_PC466 | 16.1106 | 2019 |
Mauritania | CC478_PC478 | 4.3909 | 2019 |
Niger | CC562_PC562 | 18.4027 | 2019 |
Nigeria | CC566_PC566 | 220.652 | 2019 |
Saint Helena | CC654_PC654 | 15.541 | 2019 |
Senegal | CC686_PC686 | 84.6432 | 2019 |
Sierra Leone | CC694_PC694 | 108.246 | 2019 |
Togo | CC768_PC768 | 148.6 | 2019 |
Algeria | CC12_PC12 | 18.0763 | 2019 |
Egypt | CC818_PC818 | 100.847 | 2019 |
Libya | CC434_PC434 | 3.85183 | 2019 |
Morocco | CC504_PC504 | 81.7203 | 2019 |
Sudan | CC729_PC729 | 24.2561 | 2019 |
Tunisia | CC788_PC788 | 75.275 | 2019 |
Western Sahara | CC732_PC732 | 2.18969 | 2019 |
Armenia | CC51_PC51 | 103.889 | 2019 |
Azerbaijan | CC31_PC31 | 121.558 | 2019 |
Bahrain | CC48_PC48 | 2159.43 | 2019 |
Cyprus | CC196_PC196 | 129.716 | 2019 |
Georgia | CC268_PC268 | 57.5156 | 2019 |
Iraq | CC368_PC368 | 90.5088 | 2019 |
Israel | CC376_PC376 | 393.686 | 2019 |
Jordan | CC400_PC400 | 113.783 | 2019 |
Kuwait | CC414_PC414 | 236.087 | 2019 |
Lebanon | CC422_PC422 | 670.157 | 2019 |
Oman | CC512_PC512 | 16.0743 | 2019 |
Qatar | CC634_PC634 | 243.934 | 2019 |
Saudi Arabia | CC682_PC682 | 15.9411 | 2019 |
State of Palestine | CC275_PC275 | 827.479 | 2019 |
Syrian Arab Republic | CC760_PC760 | 92.9594 | 2019 |
Turkey | CC792_PC792 | 108.402 | 2019 |
United Arab Emirates | CC784_PC784 | 116.872 | 2019 |
Yemen | CC887_PC887 | 55.2341 | 2019 |
Kazakhstan | CC398_PC398 | 6.87166 | 2019 |
Kyrgyzstan | CC417_PC417 | 33.4507 | 2019 |
Tajikistan | CC762_PC762 | 66.5978 | 2019 |
Turkmenistan | CC795_PC795 | 12.6446 | 2019 |
Uzbekistan | CC860_PC860 | 77.5311 | 2019 |
Afghanistan | CC4_PC4 | 58.2694 | 2019 |
Bangladesh | CC50_PC50 | 1252.56 | 2019 |
Bhutan | CC64_PC64 | 20.0198 | 2019 |
India | CC356_PC356 | 459.58 | 2019 |
Iran (Islamic Republic of) | CC364_PC364 | 50.9127 | 2019 |
Maldives | CC462_PC462 | 1769.86 | 2019 |
Nepal | CC524_PC524 | 199.572 | 2019 |
Pakistan | CC586_PC586 | 280.933 | 2019 |
Sri Lanka | CC144_PC144 | 340.037 | 2019 |
China | CC156_PC156 | 152.722 | 2019 |
China, Hong Kong SAR | CC344_PC344 | 7082.05 | 2019 |
China, Macao SAR | CC446_PC446 | 21419.6 | 2019 |
China, Taiwan Province of China | CC158_PC158 | 671.389 | 2019 |
Dem. People’s Republic of Korea | CC408_PC408 | 213.156 | 2019 |
Japan | CC392_PC392 | 347.987 | 2019 |
Mongolia | CC496_PC496 | 2.07598 | 2019 |
Republic of Korea | CC410_PC410 | 526.847 | 2019 |
Brunei Darussalam | CC96_PC96 | 82.2194 | 2019 |
Cambodia | CC116_PC116 | 93.3976 | 2019 |
Indonesia | CC360_PC360 | 149.387 | 2019 |
Lao People’s Democratic Republic | CC418_PC418 | 31.0635 | 2019 |
Malaysia | CC458_PC458 | 97.2448 | 2019 |
Myanmar | CC104_PC104 | 82.7281 | 2019 |
Philippines | CC608_PC608 | 362.601 | 2019 |
Singapore | CC702_PC702 | 8291.92 | 2019 |
Thailand | CC764_PC764 | 136.283 | 2019 |
Timor-Leste | CC626_PC626 | 86.9617 | 2019 |
Viet Nam | CC704_PC704 | 311.098 | 2019 |
Anguilla | CC660_PC660 | 165.244 | 2019 |
Antigua and Barbuda | CC28_PC28 | 220.716 | 2019 |
Aruba | CC533_PC533 | 590.611 | 2019 |
Bahamas | CC44_PC44 | 38.9097 | 2019 |
Barbados | CC52_PC52 | 667.491 | 2019 |
Bonaire, Sint Eustatius and Saba | CC535_PC535 | 79.2165 | 2019 |
British Virgin Islands | CC92_PC92 | 200.22 | 2019 |
Cayman Islands | CC136_PC136 | 270.617 | 2019 |
Cuba | CC192_PC192 | 106.478 | 2019 |
Curaçao | CC531_PC531 | 368.07 | 2019 |
Dominica | CC212_PC212 | 95.744 | 2019 |
Dominican Republic | CC214_PC214 | 222.247 | 2019 |
Grenada | CC308_PC308 | 329.418 | 2019 |
Guadeloupe | CC312_PC312 | 245.73 | 2019 |
Haiti | CC332_PC332 | 408.675 | 2019 |
Jamaica | CC388_PC388 | 272.232 | 2019 |
Martinique | CC474_PC474 | 354.299 | 2019 |
Montserrat | CC500_PC500 | 49.91 | 2019 |
Puerto Rico | CC630_PC630 | 330.711 | 2019 |
Saint Barthélemy | CC652_PC652 | 447.864 | 2019 |
Saint Kitts and Nevis | CC659_PC659 | 203.208 | 2019 |
Saint Lucia | CC662_PC662 | 299.664 | 2019 |
Saint Martin (French part) | CC663_PC663 | 717.019 | 2019 |
Saint Vincent and the Grenadines | CC670_PC670 | 283.572 | 2019 |
Sint Maarten (Dutch part) | CC534_PC534 | 1246.74 | 2019 |
Trinidad and Tobago | CC780_PC780 | 271.924 | 2019 |
Turks and Caicos Islands | CC796_PC796 | 40.2042 | 2019 |
United States Virgin Islands | CC850_PC850 | 298.797 | 2019 |
Belize | CC84_PC84 | 17.1132 | 2019 |
Costa Rica | CC188_PC188 | 98.8555 | 2019 |
El Salvador | CC222_PC222 | 311.465 | 2019 |
Guatemala | CC320_PC320 | 164.068 | 2019 |
Honduras | CC340_PC340 | 87.1044 | 2019 |
Mexico | CC484_PC484 | 65.627 | 2019 |
Nicaragua | CC558_PC558 | 54.3917 | 2019 |
Panama | CC591_PC591 | 57.1219 | 2019 |
Argentina | CC32_PC32 | 16.3631 | 2019 |
Bolivia (Plurinational State of) | CC68_PC68 | 10.6278 | 2019 |
Brazil | CC76_PC76 | 25.2508 | 2019 |
Chile | CC152_PC152 | 25.4892 | 2019 |
Colombia | CC170_PC170 | 45.3713 | 2019 |
Ecuador | CC218_PC218 | 69.9535 | 2019 |
Falkland Islands (Malvinas) | CC238_PC238 | 0.277075 | 2019 |
French Guiana | CC254_PC254 | 3.53799 | 2019 |
Guyana | CC328_PC328 | 3.9765 | 2019 |
Paraguay | CC600_PC600 | 17.7313 | 2019 |
Peru | CC604_PC604 | 25.3988 | 2019 |
Suriname | CC740_PC740 | 3.72669 | 2019 |
Uruguay | CC858_PC858 | 19.7791 | 2019 |
Venezuela (Bolivarian Republic of) | CC862_PC862 | 32.329 | 2019 |
Australia | CC36_PC36 | 3.28068 | 2019 |
New Zealand | CC554_PC554 | 18.1651 | 2019 |
Fiji | CC242_PC242 | 48.7113 | 2019 |
New Caledonia | CC540_PC540 | 15.4681 | 2019 |
Papua New Guinea | CC598_PC598 | 19.3793 | 2019 |
Solomon Islands | CC90_PC90 | 23.9307 | 2019 |
Vanuatu | CC548_PC548 | 24.6007 | 2019 |
Guam | CC316_PC316 | 309.806 | 2019 |
Kiribati | CC296_PC296 | 145.195 | 2019 |
Marshall Islands | CC584_PC584 | 326.617 | 2019 |
Micronesia (Fed. States of) | CC583_PC583 | 162.587 | 2019 |
Nauru | CC520_PC520 | 538.2 | 2019 |
Northern Mariana Islands | CC580_PC580 | 124.376 | 2019 |
Palau | CC585_PC585 | 39.1326 | 2019 |
American Samoa | CC16_PC16 | 276.56 | 2019 |
Cook Islands | CC184_PC184 | 73.1125 | 2019 |
French Polynesia | CC258_PC258 | 76.3074 | 2019 |
Niue | CC570_PC570 | 6.20769 | 2019 |
Samoa | CC882_PC882 | 69.6442 | 2019 |
Tokelau | CC772_PC772 | 133 | 2019 |
Tonga | CC776_PC776 | 145.135 | 2019 |
Tuvalu | CC798_PC798 | 388.5 | 2019 |
Wallis and Futuna Islands | CC876_PC876 | 81.6857 | 2019 |
Belarus | CC112_PC112 | 46.5842 | 2019 |
Bulgaria | CC100_PC100 | 64.4815 | 2019 |
Czechia | CC203_PC203 | 138.39 | 2019 |
Hungary | CC348_PC348 | 106.978 | 2019 |
Poland | CC616_PC616 | 123.723 | 2019 |
Republic of Moldova | CC498_PC498 | 123.082 | 2019 |
Romania | CC642_PC642 | 84.1315 | 2019 |
Russian Federation | CC643_PC643 | 8.90721 | 2019 |
Slovakia | CC703_PC703 | 113.48 | 2019 |
Ukraine | CC804_PC804 | 75.9401 | 2019 |
Channel Islands | CC830_PC830 | 906.653 | 2019 |
Denmark | CC208_PC208 | 136.033 | 2019 |
Estonia | CC233_PC233 | 31.2727 | 2019 |
Faroe Islands | CC234_PC234 | 34.8689 | 2019 |
Finland | CC246_PC246 | 18.2045 | 2019 |
Iceland | CC352_PC352 | 3.38192 | 2019 |
Ireland | CC372_PC372 | 70.8738 | 2019 |
Isle of Man | CC833_PC833 | 148.402 | 2019 |
Latvia | CC428_PC428 | 30.655 | 2019 |
Lithuania | CC440_PC440 | 44.0315 | 2019 |
Norway | CC578_PC578 | 14.7258 | 2019 |
Sweden | CC752_PC752 | 24.4587 | 2019 |
United Kingdom | CC826_PC826 | 279.131 | 2019 |
Albania | CC8_PC8 | 105.143 | 2019 |
Andorra | CC20_PC20 | 164.14 | 2019 |
Bosnia and Herzegovina | CC70_PC70 | 64.7255 | 2019 |
Croatia | CC191_PC191 | 73.8081 | 2019 |
Gibraltar | CC292_PC292 | 3370.6 | 2019 |
Greece | CC300_PC300 | 81.2525 | 2019 |
Holy See | CC336_PC336 | 1852.27 | 2019 |
Italy | CC380_PC380 | 205.855 | 2019 |
Malta | CC470_PC470 | 1376.18 | 2019 |
Montenegro | CC499_PC499 | 46.6906 | 2019 |
North Macedonia | CC807_PC807 | 82.6113 | 2019 |
Portugal | CC620_PC620 | 111.652 | 2019 |
San Marino | CC674_PC674 | 564.4 | 2019 |
Serbia | CC688_PC688 | 100.3 | 2019 |
Slovenia | CC705_PC705 | 103.21 | 2019 |
Spain | CC724_PC724 | 93.6984 | 2019 |
Austria | CC40_PC40 | 108.667 | 2019 |
Belgium | CC56_PC56 | 381.087 | 2019 |
France | CC250_PC250 | 118.946 | 2019 |
Germany | CC276_PC276 | 239.606 | 2019 |
Liechtenstein | CC438_PC438 | 237.625 | 2019 |
Luxembourg | CC442_PC442 | 237.734 | 2019 |
Monaco | CC492_PC492 | 26152.3 | 2019 |
Netherlands | CC528_PC528 | 507.032 | 2019 |
Switzerland | CC756_PC756 | 217.415 | 2019 |
Bermuda | CC60_PC60 | 1250.16 | 2019 |
Canada | CC124_PC124 | 4.11404 | 2019 |
Greenland | CC304_PC304 | 0.138044 | 2019 |
Saint Pierre and Miquelon | CC666_PC666 | 25.3087 | 2019 |
United States of America | CC840_PC840 | 35.9735 | 2019 |
Modular Programming in Python
1 Components of Modular Programming
Functions, modules and packages structure programs, making them:
- more readable
- easier to fix
- simpler to add new features to
A module is a file that contains one or more units of code - in our case: functions. A collection of module files together forms a package.
You have already been using functions, modules and packages that were written by other people. For example pandas
.
You can use automated tests to check that each component of your code performs as expected.
Testing sections of your code independently using code - “Unit Testing” - is a concept covered in further courses. To do this your code needs to use functions, modules and packages.
Testing multiple sections together and their interactions with each other is called - “Integration testing”.
To show how to structure code in an analysis context we will use an example scenario to go through the steps taken.
2 Introducing the Project
You have been assigned to a group within your department responsible for analysing populations across the world. This work is in collaboration with the United Nations.
Your job is to provide analysis of population densities across the different United Nations Sustainable Development Goal (SDG) regions. You must provide average population density values for each SDG region.
One of your colleagues has already conducted this analysis on an ad hoc basis. They have given you their code to start with, but they have only analysed one year of data so far. You have been asked to write code that will be able to analyse multiple years of data, all in different files.
Before tackling the big task of analysing all the data, you are going to restructure your colleague's code. To make the process more reproducible in the future you will restructure their code into functions and modules.
This process is called “refactoring”.
Refactoring is a process of improving your code, while keeping it able to perform the same task. This helps clean the code and improve its design.
You have been sent two data sets needed to reproduce the analysis your colleague performed. Have a look through the data. What steps do you think need to be considered to make the data analysable?
2.1 Population Density
The population_density_2019.csv
data contains each country’s population density, name and a unique country and parent group code column. There is also a year column.
The data is only from 2019.
2.2 Location IDs
The locations.csv
data contains each country's location ID (the same as a country code), and which Sustainable Development Goal Region the location ID is part of.
The data is valid for all years.
Location ID | SDG Region Name |
---|---|
“108” | Sub-Saharan Africa |
“174” | Sub-Saharan Africa |
“262” | Sub-Saharan Africa |
“232” | Sub-Saharan Africa |
“231” | Sub-Saharan Africa |
“404” | Sub-Saharan Africa |
“450” | Sub-Saharan Africa |
“454” | Sub-Saharan Africa |
“480” | Sub-Saharan Africa |
“175” | Sub-Saharan Africa |
“508” | Sub-Saharan Africa |
“638” | Sub-Saharan Africa |
“646” | Sub-Saharan Africa |
“690” | Sub-Saharan Africa |
“706” | Sub-Saharan Africa |
“728” | Sub-Saharan Africa |
“800” | Sub-Saharan Africa |
“834” | Sub-Saharan Africa |
“894” | Sub-Saharan Africa |
“716” | Sub-Saharan Africa |
“24” | Sub-Saharan Africa |
“120” | Sub-Saharan Africa |
“140” | Sub-Saharan Africa |
“148” | Sub-Saharan Africa |
“178” | Sub-Saharan Africa |
“180” | Sub-Saharan Africa |
“226” | Sub-Saharan Africa |
“266” | Sub-Saharan Africa |
“678” | Sub-Saharan Africa |
“72” | Sub-Saharan Africa |
“748” | Sub-Saharan Africa |
“426” | Sub-Saharan Africa |
“516” | Sub-Saharan Africa |
“710” | Sub-Saharan Africa |
“204” | Sub-Saharan Africa |
“854” | Sub-Saharan Africa |
“132” | Sub-Saharan Africa |
“384” | Sub-Saharan Africa |
“270” | Sub-Saharan Africa |
“288” | Sub-Saharan Africa |
“324” | Sub-Saharan Africa |
“624” | Sub-Saharan Africa |
“430” | Sub-Saharan Africa |
“466” | Sub-Saharan Africa |
“478” | Sub-Saharan Africa |
“562” | Sub-Saharan Africa |
“566” | Sub-Saharan Africa |
“654” | Sub-Saharan Africa |
“686” | Sub-Saharan Africa |
“694” | Sub-Saharan Africa |
“768” | Sub-Saharan Africa |
“12” | Northern Africa and Western Asia |
“818” | Northern Africa and Western Asia |
“434” | Northern Africa and Western Asia |
“504” | Northern Africa and Western Asia |
“729” | Northern Africa and Western Asia |
“788” | Northern Africa and Western Asia |
“732” | Northern Africa and Western Asia |
“51” | Northern Africa and Western Asia |
“31” | Northern Africa and Western Asia |
“48” | Northern Africa and Western Asia |
“196” | Northern Africa and Western Asia |
“268” | Northern Africa and Western Asia |
“368” | Northern Africa and Western Asia |
“376” | Northern Africa and Western Asia |
“400” | Northern Africa and Western Asia |
“414” | Northern Africa and Western Asia |
“422” | Northern Africa and Western Asia |
“512” | Northern Africa and Western Asia |
“634” | Northern Africa and Western Asia |
“682” | Northern Africa and Western Asia |
“275” | Northern Africa and Western Asia |
“760” | Northern Africa and Western Asia |
“792” | Northern Africa and Western Asia |
“784” | Northern Africa and Western Asia |
“887” | Northern Africa and Western Asia |
“398” | Central and Southern Asia |
“417” | Central and Southern Asia |
“762” | Central and Southern Asia |
“795” | Central and Southern Asia |
“860” | Central and Southern Asia |
“4” | Central and Southern Asia |
“50” | Central and Southern Asia |
“64” | Central and Southern Asia |
“356” | Central and Southern Asia |
“364” | Central and Southern Asia |
“462” | Central and Southern Asia |
“524” | Central and Southern Asia |
“586” | Central and Southern Asia |
“144” | Central and Southern Asia |
“156” | Eastern and South-Eastern Asia |
“344” | Eastern and South-Eastern Asia |
“446” | Eastern and South-Eastern Asia |
“158” | Eastern and South-Eastern Asia |
“408” | Eastern and South-Eastern Asia |
“392” | Eastern and South-Eastern Asia |
“496” | Eastern and South-Eastern Asia |
“410” | Eastern and South-Eastern Asia |
“96” | Eastern and South-Eastern Asia |
“116” | Eastern and South-Eastern Asia |
“360” | Eastern and South-Eastern Asia |
“418” | Eastern and South-Eastern Asia |
“458” | Eastern and South-Eastern Asia |
“104” | Eastern and South-Eastern Asia |
“608” | Eastern and South-Eastern Asia |
“702” | Eastern and South-Eastern Asia |
“764” | Eastern and South-Eastern Asia |
“626” | Eastern and South-Eastern Asia |
“704” | Eastern and South-Eastern Asia |
“660” | Latin America and the Caribbean |
“28” | Latin America and the Caribbean |
“533” | Latin America and the Caribbean |
“44” | Latin America and the Caribbean |
“52” | Latin America and the Caribbean |
“535” | Latin America and the Caribbean |
“92” | Latin America and the Caribbean |
“136” | Latin America and the Caribbean |
“192” | Latin America and the Caribbean |
“531” | Latin America and the Caribbean |
“212” | Latin America and the Caribbean |
“214” | Latin America and the Caribbean |
“308” | Latin America and the Caribbean |
“312” | Latin America and the Caribbean |
“332” | Latin America and the Caribbean |
“388” | Latin America and the Caribbean |
“474” | Latin America and the Caribbean |
“500” | Latin America and the Caribbean |
“630” | Latin America and the Caribbean |
“652” | Latin America and the Caribbean |
“659” | Latin America and the Caribbean |
“662” | Latin America and the Caribbean |
“663” | Latin America and the Caribbean |
“670” | Latin America and the Caribbean |
“534” | Latin America and the Caribbean |
“780” | Latin America and the Caribbean |
“796” | Latin America and the Caribbean |
“850” | Latin America and the Caribbean |
“84” | Latin America and the Caribbean |
“188” | Latin America and the Caribbean |
“222” | Latin America and the Caribbean |
“320” | Latin America and the Caribbean |
“340” | Latin America and the Caribbean |
“484” | Latin America and the Caribbean |
“558” | Latin America and the Caribbean |
“591” | Latin America and the Caribbean |
“32” | Latin America and the Caribbean |
“68” | Latin America and the Caribbean |
“76” | Latin America and the Caribbean |
“152” | Latin America and the Caribbean |
“170” | Latin America and the Caribbean |
“218” | Latin America and the Caribbean |
“238” | Latin America and the Caribbean |
“254” | Latin America and the Caribbean |
“328” | Latin America and the Caribbean |
“600” | Latin America and the Caribbean |
“604” | Latin America and the Caribbean |
“740” | Latin America and the Caribbean |
“858” | Latin America and the Caribbean |
“862” | Latin America and the Caribbean |
“36” | Australia/New Zealand |
“554” | Australia/New Zealand |
“242” | Oceania (excluding Australia and New Zealand) |
“540” | Oceania (excluding Australia and New Zealand) |
“598” | Oceania (excluding Australia and New Zealand) |
“90” | Oceania (excluding Australia and New Zealand) |
“548” | Oceania (excluding Australia and New Zealand) |
“316” | Oceania (excluding Australia and New Zealand) |
“296” | Oceania (excluding Australia and New Zealand) |
“584” | Oceania (excluding Australia and New Zealand) |
“583” | Oceania (excluding Australia and New Zealand) |
“520” | Oceania (excluding Australia and New Zealand) |
“580” | Oceania (excluding Australia and New Zealand) |
“585” | Oceania (excluding Australia and New Zealand) |
“16” | Oceania (excluding Australia and New Zealand) |
“184” | Oceania (excluding Australia and New Zealand) |
“258” | Oceania (excluding Australia and New Zealand) |
“570” | Oceania (excluding Australia and New Zealand) |
“882” | Oceania (excluding Australia and New Zealand) |
“772” | Oceania (excluding Australia and New Zealand) |
“776” | Oceania (excluding Australia and New Zealand) |
“798” | Oceania (excluding Australia and New Zealand) |
“876” | Oceania (excluding Australia and New Zealand) |
“112” | Europe and Northern America |
“100” | Europe and Northern America |
“203” | Europe and Northern America |
“348” | Europe and Northern America |
“616” | Europe and Northern America |
“498” | Europe and Northern America |
“642” | Europe and Northern America |
“643” | Europe and Northern America |
“703” | Europe and Northern America |
“804” | Europe and Northern America |
“830” | Europe and Northern America |
“208” | Europe and Northern America |
“233” | Europe and Northern America |
“234” | Europe and Northern America |
“246” | Europe and Northern America |
“352” | Europe and Northern America |
“372” | Europe and Northern America |
“833” | Europe and Northern America |
“428” | Europe and Northern America |
“440” | Europe and Northern America |
“578” | Europe and Northern America |
“752” | Europe and Northern America |
“826” | Europe and Northern America |
“8” | Europe and Northern America |
“20” | Europe and Northern America |
“70” | Europe and Northern America |
“191” | Europe and Northern America |
“292” | Europe and Northern America |
“300” | Europe and Northern America |
“336” | Europe and Northern America |
“380” | Europe and Northern America |
“470” | Europe and Northern America |
“499” | Europe and Northern America |
“807” | Europe and Northern America |
“620” | Europe and Northern America |
“674” | Europe and Northern America |
“688” | Europe and Northern America |
“705” | Europe and Northern America |
“724” | Europe and Northern America |
“40” | Europe and Northern America |
“56” | Europe and Northern America |
“250” | Europe and Northern America |
“276” | Europe and Northern America |
“438” | Europe and Northern America |
“442” | Europe and Northern America |
“492” | Europe and Northern America |
“528” | Europe and Northern America |
“756” | Europe and Northern America |
“60” | Europe and Northern America |
“124” | Europe and Northern America |
“304” | Europe and Northern America |
“666” | Europe and Northern America |
“840” | Europe and Northern America |
2.3 Steps
To analyse the data we will need to have one single data frame. We must join locations and population densities on a column.
At the moment there is no exact matching column to join on, therefore we will need to manipulate columns.
Both data frames contain a “country code” value somewhere. For the Population Density dataframe, the country code will need to be separated from the parent code. There are also prefixes “CC” and “PC” that we will need to consider. For the Location IDs dataframe, the quotation marks will need to be removed.
Once the population densities have their respective SDG region in the same table the data can be aggregated. The data will be grouped by SDG region, then the mean will be calculated on the population density value.
We have stated above that the data is valid for all years, meaning that we expect the structure to be consistent. Once the 2019 data is clean, what things should we consider about applying our program to other years?
3 Building Programs
Before getting started on the task of analysing the population density data, it is important that we are aware of different styles of programming.
3.1 Basic Programs
Scripts and notebooks can be really useful tools for quick analysis, however, they limit how we can scale and improve our project.
Our scripts become one line after another of data being slightly changed at each step.
This does not group the code in a structure helpful for us to understand.
This style of programming is sometimes referred to as “imperative”.
Programmers frequently copy and paste code to reuse it in different parts of a program, with small changes.
If the requirements of our project change, we need to hunt through the code to change all the relevant variables and values. If code sections have been copied and pasted, fixing an error in one place won’t fix the copies.
If the project expands we need to write more and more code. This is often done in the same file, making the code harder to work through and understand.
3.2 Grouping Code
To structure our code better we need to be able to group a collection of code together into one object. This can be done in two ways:
- converting to functions
- converting to classes
Classes are beyond the scope of this course and are less commonly used in analytical code, so we will focus on functions here. However, many of these principles can also be applied to classes.
Properties of functions:
- functions complete a task
- functions can take inputs and give outputs
Functions can be run in one line of code, running complicated operations that have been written elsewhere. This helps “hide” some of the detail, making it clearer what is happening in the code - a process known as extraction.
Well-named functions mean we do not need to understand the details inside the function - just what they achieve.
Within this course there is a programming styles document, explaining some of the different styles of programming. This is suggested further reading at this point in the course.
There are some important principles to keep in mind when we design functions:
- functions should not have “side-effects”. Data outside the function should not be impacted by using the function
- functions should serve a single purpose
4 Scripts to Functions
In this section we will discuss considerations when converting scripts to functions, using an example script to show the steps involved in structuring code.
4.1 Example Analysis Code
The code that has been given to you by your colleague is given in this section. At present it is a script that is well commented, but not well structured. Your task is to structure the code allowing for future reproducible analysis.
At a high level, the code:
- loads in the two data sets
- cleans the data
- joins the data so all useful information is together
- calculates an aggregate statistic
- tidies the output data
- writes the data to a CSV file
Have a read through the script you have received, be sure to look up any sections you are not comfortable with.
If you would prefer to look at it within an IDE it is located in
example_code_python/initial_script/
For all scripts and files throughout this course it is assumed that the working directory being used is the location of the file being run. This may need to be changed in your given IDE.
# File to analyse the mean population density data from the UN

# Import relevant libraries for analysis
import pandas as pd
import os

# Load the population density data 2019
population_path = os.path.join("../../data/population_density_2019.csv")
pop_density = pd.read_csv(population_path)

# Clean the column names, following naming conventions similar to PEP8
pop_density.columns = pop_density.columns.str.lower()
pop_density.columns = pop_density.columns.str.replace(" ", "_")

# The country_and_parent_code column needs to
# be split into two columns without the strings
pop_density[["country_code", "parent_code"]] = (pop_density["country_and_parent_code"]
                                                .str.split("_", expand=True))

# Remove the country_and_parent_code and parent_code columns, not used in later analysis
# axis=1 to remove the columns
pop_density = pop_density.drop(labels=[
    "country_and_parent_code",
    "parent_code"
], axis=1)

# Convert country_code to integer by removing strings
pop_density["country_code"] = pop_density["country_code"].str.replace("CC", "")
pop_density["country_code"] = pop_density["country_code"].astype(int)

# Load the locations data to get the Sustainable Development Goals sub regions
locations_path = os.path.join("../../data/locations.csv")
locations = pd.read_csv(locations_path)

# Clean the column names, following naming conventions similar to PEP8
locations.columns = locations.columns.str.lower()
locations.columns = locations.columns.str.replace(" ", "_")

# The location_id data has quotation marks making it a string,
# it needs to be converted to a numeric
locations["location_id"] = locations["location_id"].str.replace('"', '')
locations["location_id"] = locations["location_id"].astype(int)

# Join the data sets
# Left merge so we keep all pop_density data
pop_density_location = pop_density.merge(locations,
                                         how="left",
                                         left_on="country_code",
                                         right_on="location_id")

# Remove the location_id column as it is equal to country_code or missing
pop_density_location = pop_density_location.drop(labels=["location_id"], axis=1)

# Get just the relevant columns in preparation
# for the following groupby
region_density = pop_density_location[["sdg_region_name", "population_density"]]

# Calculate the mean population density for each region
# A non-weighted mean
region_mean_density = (region_density.groupby('sdg_region_name', as_index=False)
                       .agg({"population_density": "mean"}))
region_mean_density = region_mean_density.rename(columns={"population_density": "mean_population_density"})

# Sort the data for clearer reading
region_mean_density = region_mean_density.sort_values(by="mean_population_density",
                                                      ascending=False)

# Round mean density for clearer reading
region_mean_density["mean_population_density"] = region_mean_density["mean_population_density"].round(2)

# Write out the final output
region_mean_density.to_csv("mean_population_density_output.csv", index=False)
Output data:
sdg_region_name | mean_population_density |
---|---|
Eastern and South-Eastern Asia | 2112.67 |
Europe and Northern America | 764.93 |
Central and Southern Asia | 330.63 |
Northern Africa and Western Asia | 234.38 |
Latin America and the Caribbean | 199.62 |
Oceania (excluding Australia and New Zealand) | 144.2 |
Sub-Saharan Africa | 126.55 |
Australia/New Zealand | 10.72 |
4.2 Grouping Code by Functionality
Chunks of code that do similar things should be grouped together.
Deciding which sections of code make sense as being part of the same function is a common challenge when structuring code.
When converting code into a function - the main thing we look for is that it achieves one task. It may take us a few lines of code to achieve this “one task” - but the point is the function has a specific purpose.
If a function has more than one task or “responsibility” it will become hard to maintain, as it has many reasons to be modified.
If a function has a single “responsibility”, it will be focussed and much more likely to be reusable elsewhere.
When writing scripts, we often repeat the same tasks at different points in the script. These are good parts of code to start converting into functions. Doing so reduces the amount of code written in the file - and makes what is happening at any step clearer.
You may also wish to consider writing helper functions for any housekeeping tasks that you tend to require frequently.
If a code block isn’t repeated throughout the code that’s okay too - all the code can be converted to functions to be called one after the other.
It is much easier to read a sequence of well-named functions, rather than a long stream of commands.
Some code is often very similar, with a variable or two difference in areas of the code. When reading the code, it’s important to think about what is happening to the variables and data involved. Consider whether a similar process is happening elsewhere, rather than whether the same data is involved. These repeating processes present opportunities to reduce the overall length of your script by writing your own custom functions.
Returning to our example script, we are going to take one task, convert it into a function, then improve the function so it can be used multiple times.
The lines of code:
- load in a data frame given a path
- reformat the column names of the data frame
4.2.1 Initial Code
# Import relevant libraries for analysis
import pandas as pd
import os

# Load the population density data 2019
population_path = os.path.join("../../data/population_density_2019.csv")
pop_density = pd.read_csv(population_path)

# Clean the column names, following naming conventions similar to PEP8
pop_density.columns = pop_density.columns.str.lower()
pop_density.columns = pop_density.columns.str.replace(" ", "_")
4.2.2 Basic Function
We can wrap the code into a function so that all the code can be run with one command like so.
def load_formatted_pop_frame():
    """Read population data and reformat column names"""
    # Load the population density data 2019
    population_path = os.path.join("../../data/population_density_2019.csv")
    pop_density = pd.read_csv(population_path)

    # Clean the column names, following naming conventions similar to PEP8
    pop_density.columns = pop_density.columns.str.lower()
    pop_density.columns = pop_density.columns.str.replace(" ", "_")

    return pop_density

# Call the function to assign the data frame
population_density = load_formatted_pop_frame()
4.2.3 Adding Parameters
To improve the function, we can add as an argument something that may change in the future - the path of the data string.
Consider how you would have to change the previous function if the location of the population_density_2019.csv
file changed.
Variable names in functions should reflect what that variable is. If you don’t know exactly the value the variable will take, then a generic name like dataframe
is appropriate. Though consider the framework that you are working in - avoid reserved words or well-established, commonly used function names.
When we add an argument to a function replacing a value within, we need to be sure to change all times that original variable was used.
Our comments should reflect the changes made too.
Note that comments should add information - the comments in this tutorial are reminders of why we are doing this, and not the style of comment you would be expected to write. Often, if functions and variables are well-named, the code does not require many comments.
The new function can now be used for both the population_density
data and locations.csv
.
def load_formatted_frame(path_to_data):
    """Read csv and reformat column names"""
    # Load the csv from given path
    formatted_path = os.path.join(path_to_data)
    dataframe = pd.read_csv(formatted_path)

    # Clean the column names, following naming conventions similar to PEP8
    dataframe.columns = dataframe.columns.str.lower()
    dataframe.columns = dataframe.columns.str.replace(" ", "_")

    return dataframe

# The path can be updated where the function is run if needed
population_density = load_formatted_frame("/data/population_density_2019.csv")

# The same function is used to load a formatted locations.csv
locations = load_formatted_frame("/data/locations.csv")
4.3 Scope
Scope is an important concept when creating functions and structuring code.
Scope refers to the places in a program that a variable can be accessed.
When writing scripts, variables can be accessed anywhere in the script - so long as the variable assignment has been run.
When we write scripts, we are storing all our variables at the highest, most accessible area of the program. This is referred to as “Global Scope”.
Variables with global scope are accessible in all locations of the program.
This is the easiest way to store variables when learning to program.
However, using global variables throughout our analysis often creates unexpected results in our code. If a new piece of code accidentally alters a global variable, it will affect all the code run after it, even if the function wasn’t meant to update the variable… errors like this can be very tricky to track down and fix.
Some variables can only be accessed in certain locations within a program. When this happens, it is referred to as “Local Scope”.
Variables have local scope if they are accessible within a part of a program such as a function. They cannot be accessed outside the function they are assigned in.
At the highest level of scope are the parts of the programming language that can be accessed anywhere - the built in functions (e.g. print()
).
To make our functions follow functional programming principles we need to keep variable scope in mind.
When designing functions:
- all variables within should either:
- be passed as arguments to the function
- be created within the function
- variables with Global Scope should only be given as arguments to the function
- although all function can access global variables, doing so makes our code harder to understand
- if we need to access data with local scope (within a function) it needs to be returned by the function
If we are clear about what variables we are accessing, we can be sure about what their values are. Using only variables passed as arguments clarifies what data a function is operating on, and makes it much easier to reuse elsewhere (as it just needs its arguments defined, no hidden dependencies on global variables).
Think of your functions as having an entrance and an exit.
- the entrance is the arguments and variables it takes as inputs
- the exit is the value it returns
When choosing which parameters to give a function there are a few things to consider:
- scope
- clarity - avoid bundling parameters together into an object, make sure each parameter is clearly named
- purpose - only include parameters needed for the task
Not all functions need to return a value, such as a function that writes out a file. In this case do not use a return statement, making it clear nothing will be returned. By default if there is no return statement in a function Python will return None
4.3.1 Function Inputs
Below are examples of code which have similar purposes, one uses parameter variables well, the other does not.
This is bad because we are altering data that has not been passed as arguments to the function.
= ["a", "b", "c", "d", "e"]
letters
def add_letter():
= letters + ["f"]
long_letters return long_letters
# Run on original data
print(add_letter())
# the value of letters could be changed elsewhere in the program
= ["1", "2", "3", "4", "5"]
letters
# Without changing our function call at all we get a different result
# with the same function call
print(add_letter())
['a', 'b', 'c', 'd', 'e', 'f']
['1', '2', '3', '4', '5', 'f']
This is better because the function is more clearly dependent on the input values.
= ["a", "b", "c", "d", "e"]
letters
def add_letter(charcter_list):
= charcter_list + ["f"]
long_letters return long_letters
# Run on the original data
print("Initial")
print(add_letter(letters))
# If the data changes later so does our result
= ["1", "2", "3", "4", "5"]
letters
# This time we can see why the result is different
# We need to check the value of `letters`
print("Changed")
print(add_letter(letters))
Initial
['a', 'b', 'c', 'd', 'e', 'f']
Changed
['1', '2', '3', '4', '5', 'f']
4.3.2 Data Frame Considerations
As analysts and data scientists, we will often use data frames in our programs.
There are some special considerations that need to be taken when working with these objects, with regards to functional programming principles.
We need to avoid unintentionally altering an existing object when we give them to functions.
This is primarily a challenge in Python / pandas
due to Python being an object oriented language.
If we give a data frame to a function, some pandas
code can edit the original data frame “in-place” from outside of the scope of the function.
This means any other variables in the code referring to that data frame will be changed unintentionally.
Below is an example of the unintentional effect this can have.
initial_frame = pd.DataFrame(columns=["first", "second"])

print("initial_frame before function called:", initial_frame)

def add_values(dataframe):
    # This mutates the caller's data frame "in-place" — a side effect
    dataframe[["first", "second"]] = pd.Series(["value1, value2"]).str.split(", ", expand=True)

    return dataframe

changed_frame = add_values(initial_frame)

print("changed_frame:", changed_frame)
print("initial_frame after function called:", initial_frame)

# Without intending we have edited initial_frame
print("Inital and new frame are equal:", initial_frame.equals(changed_frame))
initial_frame before function called: Empty DataFrame
Columns: [first, second]
Index: []
changed_frame: first second
0 value1 value2
initial_frame after function called: first second
0 value1 value2
Inital and new frame are equal: True
There are different approaches to prevent this phenomenon. We will look at one in particular to tackle the problem.
A local copy of the original dataframe can be made within each function that takes as a parameter a data frame. This local copy is worked with and manipulated, then returned.
This prevents the original object from being changed.
The .copy()
method is used to achieve this.
initial_frame = pd.DataFrame(columns=["first", "second"])

print("initial_frame before function called:", initial_frame)

def add_values(dataframe):
    # Typically the original data frame name is overwritten
    # This avoids potential naming issues
    dataframe = dataframe.copy()

    dataframe[["first", "second"]] = pd.Series(["value1, value2"]).str.split(", ", expand=True)

    return dataframe

changed_frame = add_values(initial_frame)

print("changed_frame:", changed_frame)
print("initial_frame after function called:", initial_frame)

# Check if we have edited the original
print("Inital and new frame are equal:", initial_frame.equals(changed_frame))
initial_frame before function called: Empty DataFrame
Columns: [first, second]
Index: []
changed_frame: first second
0 value1 value2
initial_frame after function called: Empty DataFrame
Columns: [first, second]
Index: []
Inital and new frame are equal: False
This approach helps to prevent side effects of our functions.
There is however a trade-off - as we make more copies of data our memory usage will increase.
While this is rarely a problem for small data sets, it is something to keep in mind as your projects get bigger.
There are design approaches that can be used to reduce the memory usage of your program.
For example, removing unneeded data and duplicates, piping and manipulating data with inplace
parameters can help.
4.4 Exercises
Using the code snippets from the example analysis below, write a function that:
- takes an input population density data frame
- splits the
country_and_parent_code
column intoparent_code
andcountry_code
columns - drops the
country_and_parent_code
andparent_code
columns - returns the new data frame
Add this function into the file example_code_python/function_input/exercise1.py
or example_code_R/function_input/exercise1.R
depending on your chosen framework. Use the code already there to test your result on pop_density
.
Name the function access_country_code()
.
# The country_and_parent_code column needs to
# be split into two columns without the strings
pop_density[["country_code", "parent_code"]] = (pop_density["country_and_parent_code"]
                                                .str.split("_", expand=True))

# Remove the country_and_parent_code and parent_code columns, not used in later analysis
# axis=1 to remove the columns
pop_density = pop_density.drop(labels=[
    "country_and_parent_code",
    "parent_code"
], axis=1)
import pandas as pd
import os

## Code to be improved to complete exercise 1
def load_formatted_frame(path_to_data):
    """Read csv and reformat column names"""
    # Load the csv from given path
    formatted_path = os.path.join(path_to_data)
    dataframe = pd.read_csv(formatted_path)

    # Clean the column names, following naming conventions similar to PEP8
    dataframe.columns = dataframe.columns.str.lower()
    dataframe.columns = dataframe.columns.str.replace(" ", "_")

    return dataframe

def access_country_code(dataframe):
    """Function to split combined code columns and remove unnecessary columns"""
    # Copy the incoming data to prevent editing the original
    dataframe = dataframe.copy()

    dataframe[["country_code", "parent_code"]] = (dataframe["country_and_parent_code"]
                                                  .str.split("_", expand=True))

    dataframe_dropped = dataframe.drop(labels=[
        "country_and_parent_code",
        "parent_code"
    ], axis=1)

    return dataframe_dropped

# Loading both data frames
pop_density = load_formatted_frame("../../data/population_density_2019.csv")
locations = load_formatted_frame("../../data/locations.csv")

pop_density_single_code = access_country_code(pop_density)
print(pop_density_single_code["country_code"])
Using the code snippets from our example analysis below, write a function that:
- takes a data frame as an input
- can replace a string within a specified column
- can convert the type of a given column
This function will be used across both data frames later - so be sure it is general enough to work for both. In addition, it must use only data it gets as arguments.
Add this function into the file example_code/function_input/exercise2.py
. Use the code already there to test your result on locations
and pop_density
.
Name the function convert_type_to_int()
.
# Replace specific string in column
locations["location_id"] = locations["location_id"].str.replace('"', '')

# Convert the type of the column
locations["location_id"] = locations["location_id"].astype(int)
import pandas as pd
import os

## Code to be improved to complete exercise 2
def load_formatted_frame(path_to_data):
    """Read csv and reformat column names"""
    # Load the csv from given path
    formatted_path = os.path.join(path_to_data)
    dataframe = pd.read_csv(formatted_path)

    # Clean the column names, following naming conventions similar to PEP8
    dataframe.columns = dataframe.columns.str.lower()
    dataframe.columns = dataframe.columns.str.replace(" ", "_")

    return dataframe

def access_country_code(dataframe):
    """Function to split combined code columns and remove unnecessary columns"""
    dataframe = dataframe.copy()

    dataframe[["country_code", "parent_code"]] = (dataframe["country_and_parent_code"]
                                                  .str.split("_", expand=True))

    dataframe_dropped = dataframe.drop(labels=[
        "country_and_parent_code",
        "parent_code"
    ], axis=1)

    return dataframe_dropped

def convert_type_to_int(dataframe, column_name, string_value):
    """Function to convert string to integer column type"""
    dataframe = dataframe.copy()

    dataframe[column_name] = dataframe[column_name].str.replace(string_value, "")

    dataframe[column_name] = dataframe[column_name].astype(int)

    return dataframe

## Run the functions created
pop_density = load_formatted_frame("../../data/population_density_2019.csv")
locations = load_formatted_frame("../../data/locations.csv")

pop_density_single_code = access_country_code(pop_density)

pop_density_correct_types = convert_type_to_int(dataframe=pop_density_single_code,
                                                column_name="country_code",
                                                string_value="CC")

locations_correct_types = convert_type_to_int(dataframe=locations,
                                              column_name="location_id",
                                              string_value='"')

print(pop_density_correct_types.dtypes)
print(locations_correct_types.dtypes)
Using the code snippets from our example analysis below, write a function that:
- takes two data frames as inputs
- takes two string inputs
- performs a left join on a column from each data frame, the columns are given by the strings input
- removes the second specified string column from the joined data frame
- returns a single data frame
This function will be used after the previous functions using the data frames outputted.
Add this function into the file example_code_python|R/function_input/exercise3.py|r
. Use the code already there to test your result on the new data frame.
This function will be useful for our specific case, but also if we want to join other data frames or use different column names.
Our column names could change if we change an upstream function, so it’s important we give them as inputs.
Name the function join_frames()
.
# Join the data sets
# Left merge so we keep all pop_density data
pop_density_location = pop_density.merge(locations,
                                         how="left",
                                         left_on="country_code",
                                         right_on="location_id")

# Remove the location_id column as it is equal to country_code or missing
pop_density_location = pop_density_location.drop(labels=["location_id"], axis=1)
import pandas as pd
import os

def load_formatted_frame(path_to_data):
    """Read csv and reformat column names"""
    # Load the csv from given path
    formatted_path = os.path.join(path_to_data)
    dataframe = pd.read_csv(formatted_path)

    # Clean the column names, following naming conventions similar to PEP8
    dataframe.columns = dataframe.columns.str.lower()
    dataframe.columns = dataframe.columns.str.replace(" ", "_")

    return dataframe

def access_country_code(dataframe):
    """Function to split combined code columns and remove unnecessary columns"""
    dataframe = dataframe.copy()

    dataframe[["country_code", "parent_code"]] = (dataframe["country_and_parent_code"]
                                                  .str.split("_", expand=True))

    dataframe_dropped = dataframe.drop(labels=[
        "country_and_parent_code",
        "parent_code"
    ], axis=1)

    return dataframe_dropped

def convert_type_to_int(dataframe, column_name, string_value):
    """Function to convert string to integer column type"""
    dataframe = dataframe.copy()

    dataframe[column_name] = dataframe[column_name].str.replace(string_value, "")

    dataframe[column_name] = dataframe[column_name].astype(int)

    return dataframe

def join_frames(left_dataframe, right_dataframe, left_column, right_column):
    """
    Function to join the required frames on specified columns, dropping
    unnecessary column
    """
    left_dataframe = left_dataframe.copy()
    right_dataframe = right_dataframe.copy()

    combined_frames = left_dataframe.merge(right=right_dataframe,
                                           how="left",
                                           left_on=left_column,
                                           right_on=right_column)

    combined_frames_reduced = combined_frames.drop(labels=[right_column], axis=1)

    return combined_frames_reduced

## Run the functions created
pop_density = load_formatted_frame("../../data/population_density_2019.csv")
locations = load_formatted_frame("../../data/locations.csv")

pop_density_single_code = access_country_code(pop_density)

pop_density_correct_types = convert_type_to_int(dataframe=pop_density_single_code,
                                                column_name="country_code",
                                                string_value="CC")

locations_correct_types = convert_type_to_int(dataframe=locations,
                                              column_name="location_id",
                                              string_value='"')

population_location = join_frames(left_dataframe=pop_density_correct_types,
                                  right_dataframe=locations_correct_types,
                                  left_column="country_code",
                                  right_column="location_id")

print(population_location.columns)
print(population_location.head(10))
4.5 High Level Functions
This section will introduce some concepts and good practice that are relevant for when you have converted your script into functions.
In the section below, a version of code with all tasks broken into functions is shown. To help consolidate your learning from the previous exercises, an extension exercise is to convert the remaining code to functions yourself.
4.5.1 Extension Exercise
Using exercise3_answers.py
convert the remaining script code into functions. The functions should be called:
aggregate_statistic()
format_frame()
write_output()
Each of these functions perform one task. They are general enough that they work for our specific situation but leave some room for if we wanted to make minor adjustments upstream, such as column or filenames.
Side Note: We are writing the function write_output()
as practice, it only contains one single line of code so in practice it wouldn’t be used as a function. It’s important to avoid writing functions that are too small.
import pandas as pd
import os

def load_formatted_frame(path_to_data):
    """Read csv and reformat column names"""
    # Load the csv from given path
    formatted_path = os.path.join(path_to_data)
    dataframe = pd.read_csv(formatted_path)

    # Clean the column names, following naming conventions similar to PEP8
    dataframe.columns = dataframe.columns.str.lower()
    dataframe.columns = dataframe.columns.str.replace(" ", "_")

    return dataframe

def access_country_code(dataframe):
    """Function to split combined code columns and remove unnecessary columns"""
    dataframe = dataframe.copy()

    dataframe[["country_code", "parent_code"]] = (dataframe["country_and_parent_code"]
                                                  .str.split("_", expand=True))

    dataframe_dropped = dataframe.drop(labels=[
        "country_and_parent_code",
        "parent_code"
    ], axis=1)

    return dataframe_dropped

def convert_type_to_int(dataframe, column_name, string_value):
    """Function to convert string to integer column type"""
    dataframe = dataframe.copy()

    dataframe[column_name] = dataframe[column_name].str.replace(string_value, "")

    dataframe[column_name] = dataframe[column_name].astype(int)

    return dataframe

def join_frames(left_dataframe, right_dataframe, left_column, right_column):
    """
    Function to join the required frames on specified columns, dropping
    unnecessary column
    """
    left_dataframe = left_dataframe.copy()
    right_dataframe = right_dataframe.copy()

    combined_frames = left_dataframe.merge(right=right_dataframe,
                                           how="left",
                                           left_on=left_column,
                                           right_on=right_column)

    combined_frames_reduced = combined_frames.drop(labels=[right_column], axis=1)

    return combined_frames_reduced

# NOTE: parameter renamed from `statistic_column` to `aggregate_column` — the
# original body referenced the undefined name `aggregate_column` (NameError),
# and the call site below passes `aggregate_column=` as a keyword.
def aggregate_mean(dataframe, groupby_column, aggregate_column):
    """Function to groupby and calculate the aggregate mean of two columns"""
    dataframe = dataframe.copy()

    # Remove unnecessary columns
    subset = dataframe[[groupby_column, aggregate_column]]

    # Perform mean calculation
    statistic = (subset.groupby(groupby_column, as_index=False)
                 .agg({aggregate_column: "mean"}))

    statistic_renamed = statistic.rename(columns={aggregate_column: "mean_" + aggregate_column})

    return statistic_renamed

def format_frame(dataframe, statistic_column):
    """Function to format the dataframe for output"""
    dataframe = dataframe.copy()

    dataframe_sorted = dataframe.sort_values(by=statistic_column,
                                             ascending=False)

    dataframe_sorted[statistic_column] = dataframe_sorted[statistic_column].round(2)

    return dataframe_sorted

def write_output(dataframe, output_filepath):
    """Function to write output statistic in formatted manner"""
    dataframe.to_csv(output_filepath, index=False, sep=",")

    # We are not returning anything so our function
    # does not need a return value. By default this
    # will return `None`

## Run the functions created
pop_density = load_formatted_frame("../../data/population_density_2019.csv")
locations = load_formatted_frame("../../data/locations.csv")

pop_density_single_code = access_country_code(pop_density)

pop_density_correct_types = convert_type_to_int(dataframe=pop_density_single_code,
                                                column_name="country_code",
                                                string_value="CC")

locations_correct_types = convert_type_to_int(dataframe=locations,
                                              column_name="location_id",
                                              string_value='"')

population_location = join_frames(left_dataframe=pop_density_correct_types,
                                  right_dataframe=locations_correct_types,
                                  left_column="country_code",
                                  right_column="location_id")

aggregation = aggregate_mean(dataframe=population_location,
                             groupby_column="sdg_region_name",
                             aggregate_column="population_density")

formatted_statistic = format_frame(aggregation, "mean_population_density")

write_output(formatted_statistic, "./mean_pop_density.csv")
4.5.2 Execute Program
Now we have converted all our code tasks into functions we can run each function, passing their output into the input of the next function.
Looking at the code at the end of our script there are a group of lines which describe the running of the program. These lines of code describe the whole analysis, showing each step in the process with a function corresponding to each step.
When we hit “Run” on our code, the code shown is run. The functions above it in the file are loaded into the program’s global scope, allowing them to be called by this code.
## Run the functions created
pop_density = load_formatted_frame("/data/population_density_2019.csv")
locations = load_formatted_frame("/data/locations.csv")

pop_density_single_code = access_country_code(pop_density)

pop_density_correct_types = convert_type_to_int(dataframe=pop_density_single_code,
                                                column_name="country_code",
                                                string_value="CC")

locations_correct_types = convert_type_to_int(dataframe=locations,
                                              column_name="location_id",
                                              string_value='"')

population_location = join_frames(left_dataframe=pop_density_correct_types,
                                  right_dataframe=locations_correct_types,
                                  left_column="country_code",
                                  right_column="location_id")

aggregation = aggregate_mean(dataframe=population_location,
                             groupby_column="sdg_region_name",
                             aggregate_column="population_density")

formatted_statistic = format_frame(aggregation, "mean_population_density")

write_output(formatted_statistic, output_filepath="./mean_pop_density.csv")
4.5.3 Main Function
The code above makes what we are doing much easier to understand. To find out what the code is doing at each step, we can just read the name of the function, or look up what it does in the documentation.
The way the code is currently designed, however, still uses variables in the global scope, something to generally avoid.
If we add one more function, that calls our other functions, we can run our whole program by calling this one function. This will make it much easier to run the analysis later down the line, and to extend our code into modules and packages.
Functions that run other functions are called “high level” functions. Using high level functions lets us build more structure to our code.
Often the convention you will see for naming a highest level function in code is calling it main()
, however it does not have to be this name. We will call our highest level analysis get_analyse_output()
.
In effect, we put all the code that was used to “run” the program within the get_analyse_output()
function. This way we can run the program only when we call get_analyse_output()
.
This is the point where typical convention between Python and R starts to differ. Be sure to check both methods if you regularly code in both.
How many levels of “high level” functions we have should be proportionate to our code. For a small task we probably don’t need high level functions. For a larger pipeline they become significantly more important.
In Python there is an extra line of code we add to help us split our code into modules.
The line is as follows:
if __name__ == "__main__":
    # Add your code to run here — this block only executes when the file
    # is run directly, not when it is imported as a module
When Python runs a file (module) the interpreter assigns it a value for the attribute __name__
.
When we click “Run” in our IDE (such as Spyder), or run a script directly in command line, the __name__
value of that file run is "__main__"
.
If the file (module) is imported elsewhere in a program, then the value of __name__
is not equal to "__main__"
. It is instead assigned the name of the module file. Therefore any code within the block if __name__ == "__main__":
will not be run.
At the moment, that is not very useful to us, but when we start expanding our code into multiple files (Convert to Modules) it becomes key.
If we want to alter the behaviour of the get_analyse_output()
function we have two options:
- alter the main function to change variables passed to the function
- add parameters
Below is our get_analyse_output()
function, and the code used to run it.
def get_analyse_output():
    """
    Access the data, run the analysis of population density means over locations,
    output the data into a CSV.
    """
    pop_density = load_formatted_frame("/data/population_density_2019.csv")
    locations = load_formatted_frame("/data/locations.csv")

    pop_density_single_code = access_country_code(pop_density)

    pop_density_correct_types = convert_type_to_int(dataframe=pop_density_single_code,
                                                    column_name="country_code",
                                                    string_value="CC")

    locations_correct_types = convert_type_to_int(dataframe=locations,
                                                  column_name="location_id",
                                                  string_value='"')

    population_location = join_frames(left_dataframe=pop_density_correct_types,
                                      right_dataframe=locations_correct_types,
                                      left_column="country_code",
                                      right_column="location_id")

    aggregation = aggregate_mean(dataframe=population_location,
                                 groupby_column="sdg_region_name",
                                 aggregate_column="population_density")

    formatted_statistic = format_frame(aggregation, "mean_population_density")

    write_output(formatted_statistic, output_filepath="./mean_pop_density.csv")

if __name__ == "__main__":
    get_analyse_output()
If we were to use this analysis on different data sets, it may be useful for us to be able to change the data inputs and outputs.
def get_analyse_output(population_filepath, location_filepath, output_filepath):
    """
    Access the data, run the analysis of population density means over locations,
    output the data into a CSV.
    """
    pop_density = load_formatted_frame(population_filepath)
    locations = load_formatted_frame(location_filepath)

    pop_density_single_code = access_country_code(pop_density)

    pop_density_correct_types = convert_type_to_int(dataframe=pop_density_single_code,
                                                    column_name="country_code",
                                                    string_value="CC")

    locations_correct_types = convert_type_to_int(dataframe=locations,
                                                  column_name="location_id",
                                                  string_value='"')

    population_location = join_frames(left_dataframe=pop_density_correct_types,
                                      right_dataframe=locations_correct_types,
                                      left_column="country_code",
                                      right_column="location_id")

    aggregation = aggregate_mean(dataframe=population_location,
                                 groupby_column="sdg_region_name",
                                 aggregate_column="population_density")

    formatted_statistic = format_frame(aggregation, "mean_population_density")

    write_output(formatted_statistic, output_filepath)

if __name__ == "__main__":
    get_analyse_output(population_filepath="/data/population_density_2019.csv",
                       location_filepath="/data/locations.csv",
                       output_filepath="./mean_pop_density.csv")
4.6 Hierarchies
We have now introduced a higher level function that runs other functions for us.
This is a great step forward in structuring our code. If we want to understand what the program does:
- we first look at this high level
get_analyse_output()
function - each function within the higher function describes a step of the process, a task
- for more information on how each task is completed, the function can be found in the script
By having some functions that call others we now have levels and dependencies of functions.
Well documented high-level functions mean we do not need to dive into the lower level functions to understand what the code does.
These relationships between functions can be described with hierarchical diagrams. Writing down the relationship between tasks in your code is an extremely useful practice in structuring code.
Below is what the code in main_func.py
looks like as a hierarchy of functions.
As you can see, a lot of steps are being run by the single get_analyse_output()
function. It is really important we have this high level function, but we can have more if it makes the structure of our program clearer.
Below we will first look at a new code diagram with a different structure to the previous, then the code it corresponds to.
This is slight overkill for our program at the moment due to its small size, but the principle is very useful as our code becomes more complex.
The new structure:
- still has a highest level
get_analyse_output()
function - contains multiple functions in between the lowest level and the highest
- has middle functions which perform a larger task, grouping smaller tasks together
Note that we have not added an additional higher level function above
write_output()
. This is because we don’t need to have a higher function calling just one lower level function. In addition, we do not always want to write data out while we test the analysis pipeline.
The benefits of this structure is that we can more easily access the data produced by our pipeline at relevant steps.
- if we want to look at the joined data after cleaning and manipulation we just call the
extract_transform()
function - to perform a different analysis on the cleaned frame we can write a different
analyse()
function and call that instead withinget_analyse_output()
4.6.1 New Structure
import pandas as pd
import os
def load_formatted_frame(path_to_data):
"""Read csv and reformat column names"""
= os.path.join(path_to_data)
formatted_path # Load the csv from given path
= pd.read_csv(formatted_path)
dataframe
# Clean the column names, following naming conventions similar to PEP8
= dataframe.columns.str.lower()
dataframe.columns = dataframe.columns.str.replace(" ", "_")
dataframe.columns
return dataframe
def access_country_code(dataframe):
    """Split the combined code column and remove unnecessary columns.

    Args:
        dataframe: frame with a "country_and_parent_code" column of strings
            in the form "<country_code>_<parent_code>".

    Returns:
        Copy of the frame with a new "country_code" column; the combined
        and parent code columns are dropped.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe[["country_code", "parent_code"]] = (dataframe["country_and_parent_code"]
                                                  .str.split("_", expand=True))

    dataframe_dropped = dataframe.drop(labels=[
        "country_and_parent_code",
        "parent_code"
    ], axis=1)

    return dataframe_dropped
def convert_type_to_int(dataframe, column_name, string_value):
    """Strip a substring from a string column and convert it to integers.

    Args:
        dataframe: frame containing the column to convert.
        column_name: name of the column to convert.
        string_value: substring removed before the integer conversion.

    Returns:
        Copy of the frame with the column converted to int.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe[column_name] = dataframe[column_name].str.replace(string_value, "")
    dataframe[column_name] = dataframe[column_name].astype(int)

    return dataframe
def join_frames(left_dataframe, right_dataframe, left_column, right_column):
    """
    Left-join the required frames on specified columns, dropping the
    unnecessary duplicated key column.

    Args:
        left_dataframe: frame to keep all rows from.
        right_dataframe: frame joined onto the left frame.
        left_column: join key column in the left frame.
        right_column: join key column in the right frame.

    Returns:
        Joined frame without the right join column.
    """
    # Copies so the callers' frames are not mutated
    left_dataframe = left_dataframe.copy()
    right_dataframe = right_dataframe.copy()

    combined_frames = left_dataframe.merge(right=right_dataframe,
                                           how="left",
                                           left_on=left_column,
                                           right_on=right_column)

    # The right key duplicates the left key after the merge
    combined_frames_reduced = combined_frames.drop(labels=[right_column], axis=1)

    return combined_frames_reduced
def aggregate_mean(dataframe, groupby_column, aggregate_column):
    """Group by one column and calculate the mean of another.

    Args:
        dataframe: frame containing both columns.
        groupby_column: column to group rows by.
        aggregate_column: numeric column averaged within each group.

    Returns:
        Frame with one row per group and a "mean_<aggregate_column>" column.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    # Remove unnecessary columns
    subset = dataframe[[groupby_column, aggregate_column]]

    # Perform mean calculation
    statistic = (subset.groupby(groupby_column, as_index=False)
                 .agg({aggregate_column: "mean"}))

    statistic_renamed = statistic.rename(columns={aggregate_column: "mean_" + aggregate_column})

    return statistic_renamed
def format_frame(dataframe, statistic_column):
    """Format the dataframe for output.

    Sorts rows by the statistic column (descending) and rounds the
    statistic to two decimal places.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe_sorted = dataframe.sort_values(by=statistic_column,
                                             ascending=False)

    dataframe_sorted[statistic_column] = dataframe_sorted[statistic_column].round(2)

    return dataframe_sorted
def write_output(dataframe, output_filepath):
    """Write the output statistic to csv, comma separated, without the index."""
    dataframe.to_csv(output_filepath, index=False, sep=",")
def extract_transform(population_filepath, location_filepath):
    """Load the data and convert it to a clean joined format for analysis.

    Args:
        population_filepath: path to the population density csv.
        location_filepath: path to the locations csv.

    Returns:
        Cleaned population density frame joined to the location data.
    """
    pop_density = load_formatted_frame(population_filepath)
    locations = load_formatted_frame(location_filepath)

    # Split out a clean country code column to join on
    pop_density_single_code = access_country_code(pop_density)

    pop_density_correct_types = convert_type_to_int(dataframe=pop_density_single_code,
                                                    column_name="country_code",
                                                    string_value="CC")

    # Strip stray quote characters from the location ids
    locations_correct_types = convert_type_to_int(dataframe=locations,
                                                  column_name="location_id",
                                                  string_value='"')

    population_location = join_frames(left_dataframe=pop_density_correct_types,
                                      right_dataframe=locations_correct_types,
                                      left_column="country_code",
                                      right_column="location_id")

    return population_location
def analyse(full_dataframe, groupby_column, aggregate_column, statistic_column):
    """Perform the groupby mean of population density and reformat the result.

    Args:
        full_dataframe: cleaned, joined frame produced by extract_transform().
        groupby_column: column to group by.
        aggregate_column: numeric column to average.
        statistic_column: name of the resulting statistic column to format.

    Returns:
        Sorted, rounded statistic frame ready for output.
    """
    # Copy so the caller's frame is not mutated
    full_dataframe = full_dataframe.copy()

    aggregation = aggregate_mean(dataframe=full_dataframe,
                                 groupby_column=groupby_column,
                                 aggregate_column=aggregate_column)

    formatted_statistic = format_frame(aggregation, statistic_column=statistic_column)

    return formatted_statistic
def get_analyse_output(population_filepath, location_filepath, output_filepath):
    """
    Access the data, run the analysis of population density means over
    locations, and output the result into a csv.
    """
    population_location = extract_transform(population_filepath=population_filepath,
                                            location_filepath=location_filepath)

    formatted_statistic = analyse(full_dataframe=population_location,
                                  groupby_column="sdg_region_name",
                                  aggregate_column="population_density",
                                  statistic_column="mean_population_density")

    write_output(formatted_statistic, output_filepath)


if __name__ == "__main__":
    get_analyse_output(population_filepath="../../data/population_density_2019.csv",
                       location_filepath="../../data/locations.csv",
                       output_filepath="./mean_pop_density.csv")
4.7 Interaction
“Who will need to access this part of the program?” is a useful question to think about when structuring your code.
As the main developer you will likely be accessing the whole code base, every function.
To run the program a user only needs to interact with a small part of the program. The part of the program a user will be interacting with is called the “application programming interface”, API. Other areas of the code can be seen, but rarely used by the user.
Parts of your code can be “hidden” from the user. The end user does not need to understand every line of code or function. The user only needs to run the program.
By structuring the code properly, it is possible to “hide” the private parts from users - they do not need to understand or access the inner workings of every function - they just need to run the program.
In our code the API part would be the get_analyse_output()
function.
Separating public facing and lower level functions improves clarity and usability. All code should be as clear as possible whether it is the API or lower to help with future development.
Having this distinction allows us to test the code at the correct levels.
By having a hierarchy of functions with distinctions about what the API is can make the code simpler. Structuring the code well makes it easier to run, test and fix for developers and users.
This concept becomes more important in:
- software products with non-technical users
- object-oriented programming, using public interfaces
Ideally, a user does not need to open any code files to run analysis. Instead, the user can work with a graphical user interface (GUI) or command line interface. Parameters such as the input data file paths and output paths are written in a separate file or by the user in the interface.
“What will the end product of my analysis pipeline be?” is an important question to consider when structuring your project.
5 Functions to Modules
Earlier in this course a scenario was introduced explaining that a single script can grow large and become difficult to maintain.
Although adding structure with functions makes our code better, it can make it longer. Larger code files are difficult to maintain and understand.
We can make our code even clearer and better-structured by moving the functions in our code into different files. By grouping related functions together into different files it will be easier to look up different parts of our code. We no longer need to scroll through thousands of lines of code, we just navigate to the relevant file.
When we move functions (or other objects) into different files, they then need to be imported back into the file we are using those functions in.
When we move code into different files the code in files are imported as “modules” in Python.
Before structuring code in different files, we need to discuss how to structure our directory properly to help us with this.
5.1 Project Structure
Now we are moving beyond working with just one script we need to consider our project, files, folders/directories and paths.
A key part of building a reproducible collection of code is making the project folder simple to understand, navigate and work with.
There is no single folder structure that is perfect for all analysis, however, there are good minimum requirements and guiding principles.
The situation to avoid is having all your data, source code, notebooks and documentation in the same location. This is confusing to anyone else looking in, and makes it harder for your project to be extended.
In this section we will outline basic components of project structure, their relevance to this course, and point to good resources for deciding your own approach.
5.1.1 Guiding Principles
The main principles are
- the complexity of the folder structure should be in line with the size/complexity of the project
- smaller analyses should have a simpler folder structure
- larger projects require more depth of structure (more sections, more folders dividing areas)
- different file types should generally be separated, for example keep the .py files together, the CSVs together, the Jupyter notebooks in one place
- what the end product of the project is should impact the structure. If the code is to become a package, an appropriate structure should be used.
5.1.2 Minimum requirements
A directory structure for analysis should separate the:
- data used to analyse
- source code to perform analysis
- report generation / notebook files, figures and images
- documentation
- READMEs, licenses, package requirements
In addition, relevant version control folders/files will be present (not covered in this course) - .git folder, .gitignore file.
How this is done may depend on your team, language, and specific use case.
An example folder structure for our project is shown below, this is a minimum and could be extended.
Note: /src/
stands for “source” - referring to your source code, the files your program is written with.
5.1.3 Additional content
There are other folders and considerations to structure your project beyond the minimum.
You may want to have separate folders for:
- different parts of your code within the
/src/
folder - references such as data dictionaries and user manuals
- Further divisions of your
/data/
folder - models or other output products produced
- notebooks for enhanced documentations and examples
- A way to produce example data
- environment building
In Python there are alternative resources for data science project structure:
For both Python and R there is a project structure designed by the Government Digital Service for data science projects.
- Cookiecutter project structure for data science in government The repository is designed to use best practices. It uses pre-populated templates to encourage collaboration. In addition it prevents the committing of passwords and rendered notebooks.
5.2 Using Separate Files
Now that we are aware of good project folder structure, we can discuss separating our big full code file into more logical smaller files.
This section will focus on the code contained within the /src/
folder shown in the last section.
Group functions with similar purpose together, such as data cleaning
, loading
, modelling
. Make each file/module as focussed as possible to make it easy to find any required function.
To move the functions between files there are four main steps that need to be taken:
- move the code between files (copy and paste)
- check the new file can access all the code it needs
- import the new file / function into the relevant files in the code base
- check this has not affected how our code runs (test it hasn’t changed)
Moving code between files when a script already exists is a task that can be avoided by designing your project files in a useful way when starting to write your code. Any new analysis should make use of existing modules that you have created.
5.2.1 Moving Code
In this section we will learn how to move functions between files.
In the earlier part of the course “Function Inputs” we discussed why it is important that variables are only accessed through function inputs and outputs. This principle is even more important when moving code between files.
We are first going to make a new file called input_output.py
. This file is going to contain all the code we need for loading and exporting our data frames. It is good practice to group related functions into the same file - especially around data access.
In addition, we are going to rename our original script to main.py
. This is the file that will run all our code.
Within the input_output.py
file we are going to put the following functions, removing them from main.py
:
load_formatted_frame()
write_output()
Our files will now appear as below. Note, they will not currently run.
File contains most of the code used to run the program.
import pandas as pd
def access_country_code(dataframe):
    """Split the combined code column and remove unnecessary columns.

    Args:
        dataframe: frame with a "country_and_parent_code" column of strings
            in the form "<country_code>_<parent_code>".

    Returns:
        Copy of the frame with a new "country_code" column; the combined
        and parent code columns are dropped.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe[["country_code", "parent_code"]] = (dataframe["country_and_parent_code"]
                                                  .str.split("_", expand=True))

    dataframe_dropped = dataframe.drop(labels=[
        "country_and_parent_code",
        "parent_code"
    ], axis=1)

    return dataframe_dropped
def convert_type_to_int(dataframe, column_name, string_value):
    """Strip a substring from a string column and convert it to integers.

    Args:
        dataframe: frame containing the column to convert.
        column_name: name of the column to convert.
        string_value: substring removed before the integer conversion.

    Returns:
        Copy of the frame with the column converted to int.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe[column_name] = dataframe[column_name].str.replace(string_value, "")
    dataframe[column_name] = dataframe[column_name].astype(int)

    return dataframe
def join_frames(left_dataframe, right_dataframe, left_column, right_column):
    """
    Left-join the required frames on specified columns, dropping the
    unnecessary duplicated key column.

    Args:
        left_dataframe: frame to keep all rows from.
        right_dataframe: frame joined onto the left frame.
        left_column: join key column in the left frame.
        right_column: join key column in the right frame.

    Returns:
        Joined frame without the right join column.
    """
    # Copies so the callers' frames are not mutated
    left_dataframe = left_dataframe.copy()
    right_dataframe = right_dataframe.copy()

    combined_frames = left_dataframe.merge(right=right_dataframe,
                                           how="left",
                                           left_on=left_column,
                                           right_on=right_column)

    # The right key duplicates the left key after the merge
    combined_frames_reduced = combined_frames.drop(labels=[right_column], axis=1)

    return combined_frames_reduced
def aggregate_mean(dataframe, groupby_column, aggregate_column):
    """Group by one column and calculate the mean of another.

    Args:
        dataframe: frame containing both columns.
        groupby_column: column to group rows by.
        aggregate_column: numeric column averaged within each group.

    Returns:
        Frame with one row per group and a "mean_<aggregate_column>" column.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    # Remove unnecessary columns
    subset = dataframe[[groupby_column, aggregate_column]]

    # Perform mean calculation
    statistic = (subset.groupby(groupby_column, as_index=False)
                 .agg({aggregate_column: "mean"}))

    statistic_renamed = statistic.rename(columns={aggregate_column: "mean_" + aggregate_column})

    return statistic_renamed
def format_frame(dataframe, statistic_column):
    """Format the dataframe for output.

    Sorts rows by the statistic column (descending) and rounds the
    statistic to two decimal places.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe_sorted = dataframe.sort_values(by=statistic_column,
                                             ascending=False)

    dataframe_sorted[statistic_column] = dataframe_sorted[statistic_column].round(2)

    return dataframe_sorted
def get_analyse_output():
    """
    Access the data, run the analysis of population density means over
    locations, and output the result into a csv.
    """
    pop_density = load_formatted_frame("../../data/population_density_2019.csv")
    locations = load_formatted_frame("../../data/locations.csv")

    # Split out a clean country code column to join on
    pop_density_single_code = access_country_code(pop_density)

    pop_density_correct_types = convert_type_to_int(dataframe=pop_density_single_code,
                                                    column_name="country_code",
                                                    string_value="CC")

    # Strip stray quote characters from the location ids
    locations_correct_types = convert_type_to_int(dataframe=locations,
                                                  column_name="location_id",
                                                  string_value='"')

    population_location = join_frames(left_dataframe=pop_density_correct_types,
                                      right_dataframe=locations_correct_types,
                                      left_column="country_code",
                                      right_column="location_id")

    aggregation = aggregate_mean(dataframe=population_location,
                                 groupby_column="sdg_region_name",
                                 aggregate_column="population_density")

    formatted_statistic = format_frame(aggregation, "mean_population_density")

    write_output(formatted_statistic, output_filepath="./mean_pop_density.csv")


if __name__ == "__main__":
    get_analyse_output()
File contains the functions used for input and output operations.
import pandas as pd
import os
def load_formatted_frame(path_to_data):
    """Read a csv file and reformat its column names.

    Args:
        path_to_data: path to the csv file to load.

    Returns:
        pandas.DataFrame with lower_snake_case column names.
    """
    formatted_path = os.path.join(path_to_data)

    # Load the csv from given path
    dataframe = pd.read_csv(formatted_path)

    # Clean the column names, following naming conventions similar to PEP8
    dataframe.columns = dataframe.columns.str.lower()
    dataframe.columns = dataframe.columns.str.replace(" ", "_")

    return dataframe
def write_output(dataframe, output_filepath):
    """Write the output statistic to csv, comma separated, without the index."""
    dataframe.to_csv(output_filepath, index=False, sep=",")
5.2.2 Loading Code Between Files
The code shown above will not run because the main.py
code cannot access the functions contained within input_output.py
.
For a program to access code in another location the functions need to be loaded into that program explicitly. In Python this is called importing a module.
We load the code from one file into another, allowing our code to access the contents of the loaded file.
Loading a file puts the objects within into the scope of our program.
If we load a file’s code in the global scope of our program, then the file’s contents will be accessible anywhere in the program. If we load the file in a specific local scope it will only be accessible in that local area.
In Python we load our own modules (files) in a very similar way to how we load third party packages such as pandas
, matplotlib
or numpy
.
Python will search a range of locations to find the module we have requested. For now we will assume that all modules are in the same directory as the file we are loading them from.
When a module is imported into a file it becomes an object. The contents of the module can be accessed from this module object.
To import a module (file)
import input_output
In order to use a function from the module, we prepend the function name with the module name; this would also work for classes.
import input_output

dataframe = input_output.load_formatted_frame(path_to_data="./data.csv")
This method makes it clear from which module each function is being used.
This means we will need to change our existing main.py
file to use the correct module name and function.
Alternatively, if we only need one specific function from a module, we can import the function on its own.
from input_output import load_formatted_frame

dataframe = load_formatted_frame(path_to_data="./data.csv")
It is good practice to avoid loading in specific functions where there may already be functions in the program with the same or similar names to prevent overwriting of functions or confusion around names.
Below is the main.py
file which imports the input_output
module. The function calls in main
have been changed to refer to the input_output
module. Read through the code to see the changes.
By convention modules are loaded at the top of a file. This makes it clear what modules are used in the code and ensures all parts of the code that need the module can access it.
import pandas as pd
import input_output
def access_country_code(dataframe):
    """Split the combined code column and remove unnecessary columns.

    Args:
        dataframe: frame with a "country_and_parent_code" column of strings
            in the form "<country_code>_<parent_code>".

    Returns:
        Copy of the frame with a new "country_code" column; the combined
        and parent code columns are dropped.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe[["country_code", "parent_code"]] = (dataframe["country_and_parent_code"]
                                                  .str.split("_", expand=True))

    dataframe_dropped = dataframe.drop(labels=[
        "country_and_parent_code",
        "parent_code"
    ], axis=1)

    return dataframe_dropped
def convert_type_to_int(dataframe, column_name, string_value):
    """Strip a substring from a string column and convert it to integers.

    Args:
        dataframe: frame containing the column to convert.
        column_name: name of the column to convert.
        string_value: substring removed before the integer conversion.

    Returns:
        Copy of the frame with the column converted to int.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe[column_name] = dataframe[column_name].str.replace(string_value, "")
    dataframe[column_name] = dataframe[column_name].astype(int)

    return dataframe
def join_frames(left_dataframe, right_dataframe, left_column, right_column):
    """
    Left-join the required frames on specified columns, dropping the
    unnecessary duplicated key column.

    Args:
        left_dataframe: frame to keep all rows from.
        right_dataframe: frame joined onto the left frame.
        left_column: join key column in the left frame.
        right_column: join key column in the right frame.

    Returns:
        Joined frame without the right join column.
    """
    # Copies so the callers' frames are not mutated
    left_dataframe = left_dataframe.copy()
    right_dataframe = right_dataframe.copy()

    combined_frames = left_dataframe.merge(right=right_dataframe,
                                           how="left",
                                           left_on=left_column,
                                           right_on=right_column)

    # The right key duplicates the left key after the merge
    combined_frames_reduced = combined_frames.drop(labels=[right_column], axis=1)

    return combined_frames_reduced
def aggregate_mean(dataframe, groupby_column, aggregate_column):
    """Group by one column and calculate the mean of another.

    Args:
        dataframe: frame containing both columns.
        groupby_column: column to group rows by.
        aggregate_column: numeric column averaged within each group.

    Returns:
        Frame with one row per group and a "mean_<aggregate_column>" column.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    # Remove unnecessary columns
    subset = dataframe[[groupby_column, aggregate_column]]

    # Perform mean calculation
    statistic = (subset.groupby(groupby_column, as_index=False)
                 .agg({aggregate_column: "mean"}))

    statistic_renamed = statistic.rename(columns={aggregate_column: "mean_" + aggregate_column})

    return statistic_renamed
def format_frame(dataframe, statistic_column):
    """Format the dataframe for output.

    Sorts rows by the statistic column (descending) and rounds the
    statistic to two decimal places.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe_sorted = dataframe.sort_values(by=statistic_column,
                                             ascending=False)

    dataframe_sorted[statistic_column] = dataframe_sorted[statistic_column].round(2)

    return dataframe_sorted
def get_analyse_output():
    """
    Access the data, run the analysis of population density means over
    locations, and output the result into a csv.
    """
    # Added the module name here
    pop_density = input_output.load_formatted_frame("../../../data/population_density_2019.csv")
    locations = input_output.load_formatted_frame("../../../data/locations.csv")

    pop_density_single_code = access_country_code(pop_density)

    pop_density_correct_types = convert_type_to_int(dataframe=pop_density_single_code,
                                                    column_name="country_code",
                                                    string_value="CC")

    locations_correct_types = convert_type_to_int(dataframe=locations,
                                                  column_name="location_id",
                                                  string_value='"')

    population_location = join_frames(left_dataframe=pop_density_correct_types,
                                      right_dataframe=locations_correct_types,
                                      left_column="country_code",
                                      right_column="location_id")

    aggregation = aggregate_mean(dataframe=population_location,
                                 groupby_column="sdg_region_name",
                                 aggregate_column="population_density")

    formatted_statistic = format_frame(aggregation, "mean_population_density")

    # Added the module name here
    input_output.write_output(formatted_statistic, output_filepath="./mean_pop_density.csv")


if __name__ == "__main__":
    get_analyse_output()
If we want to debug or manually test our functions within input_output
we can add the following line of code to the end of our script. The code within this block will only run if we run the input_output.py
file itself, not the main.py
file.
if __name__ == "__main__":
# code to debug and test
As you look at others’ code you will see a variety of ways to import modules. Below we discuss two frequent conventions.
You can change the name of an imported module for ease of use by using the
as
keyword.
This is often seen with common third-party packages - such as pandas
or numpy
but will work for any module.
import pandas as pd
import numpy as np
This does not change the functionality of the module at all, just changes the name.
All objects in a module can be imported to the global scope at once, this should be avoided.
The below code is being shown to make you aware of what can be done, but should be avoided.
Instead of importing the module and then specific functions from within - or using the module itself all functions within can be imported. This is done using the *
operator. This signifies “all”.
The below code would make all the functions in input_output
available to use in the global scope.
from input_output import *

population = load_formatted_frame("path_to_population")
This may seem useful, but it will cause us to overwrite functions of the same name, and make it unclear which module each function came from. It is not clear in the example above that load_formatted_frame()
comes from input_output
. For large projects this will cause issues in readability and bug fixing.
5.3 Exercises
These exercises will help you practice splitting code into different files and loading them back into the main.py
script.
Create a new file in the example_code/modules/exercises/start/
folder called analysis.py
.
Put the following functions within the new file:
aggregate_mean()
format_frame()
Change the code in main.py
such that the file loads the relevant functions and runs the whole analysis.
Create a new file in the example_code/modules/exercises/start/
folder called manipulation.py
.
Put the following functions within the new file:
convert_type_to_int()
access_country_code()
join_frames()
Change the code in main.py
such that the file loads the relevant functions and runs the whole analysis.
import pandas as pd
def aggregate_mean(dataframe, groupby_column, aggregate_column):
    """Group by one column and calculate the mean of another.

    Args:
        dataframe: frame containing both columns.
        groupby_column: column to group rows by.
        aggregate_column: numeric column averaged within each group.

    Returns:
        Frame with one row per group and a "mean_<aggregate_column>" column.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    # Remove unnecessary columns
    subset = dataframe[[groupby_column, aggregate_column]]

    # Perform mean calculation
    statistic = (subset.groupby(groupby_column, as_index=False)
                 .agg({aggregate_column: "mean"}))

    statistic_renamed = statistic.rename(columns={aggregate_column: "mean_" + aggregate_column})

    return statistic_renamed
def format_frame(dataframe, statistic_column):
    """Format the dataframe for output.

    Sorts rows by the statistic column (descending) and rounds the
    statistic to two decimal places.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe_sorted = dataframe.sort_values(by=statistic_column,
                                             ascending=False)

    dataframe_sorted[statistic_column] = dataframe_sorted[statistic_column].round(2)

    return dataframe_sorted
import pandas as pd
def access_country_code(dataframe):
    """Split the combined code column and remove unnecessary columns.

    Args:
        dataframe: frame with a "country_and_parent_code" column of strings
            in the form "<country_code>_<parent_code>".

    Returns:
        Copy of the frame with a new "country_code" column; the combined
        and parent code columns are dropped.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe[["country_code", "parent_code"]] = (dataframe["country_and_parent_code"]
                                                  .str.split("_", expand=True))

    dataframe_dropped = dataframe.drop(labels=[
        "country_and_parent_code",
        "parent_code"
    ], axis=1)

    return dataframe_dropped
def convert_type_to_int(dataframe, column_name, string_value):
    """Strip a substring from a string column and convert it to integers.

    Args:
        dataframe: frame containing the column to convert.
        column_name: name of the column to convert.
        string_value: substring removed before the integer conversion.

    Returns:
        Copy of the frame with the column converted to int.
    """
    # Copy so the caller's frame is not mutated
    dataframe = dataframe.copy()

    dataframe[column_name] = dataframe[column_name].str.replace(string_value, "")
    dataframe[column_name] = dataframe[column_name].astype(int)

    return dataframe
def join_frames(left_dataframe, right_dataframe, left_column, right_column):
    """
    Left-join the required frames on specified columns, dropping the
    unnecessary duplicated key column.

    Args:
        left_dataframe: frame to keep all rows from.
        right_dataframe: frame joined onto the left frame.
        left_column: join key column in the left frame.
        right_column: join key column in the right frame.

    Returns:
        Joined frame without the right join column.
    """
    # Copies so the callers' frames are not mutated
    left_dataframe = left_dataframe.copy()
    right_dataframe = right_dataframe.copy()

    combined_frames = left_dataframe.merge(right=right_dataframe,
                                           how="left",
                                           left_on=left_column,
                                           right_on=right_column)

    # The right key duplicates the left key after the merge
    combined_frames_reduced = combined_frames.drop(labels=[right_column], axis=1)

    return combined_frames_reduced
import pandas as pd
# Import our required modules
import input_output
import analysis
import manipulation
def get_analyse_output():
    """
    Access the data, run the analysis of population density means over
    locations, and output the result into a csv.
    """
    pop_density = input_output.load_formatted_frame("../../../../data/population_density_2019.csv")
    locations = input_output.load_formatted_frame("../../../../data/locations.csv")

    # Added module names below
    pop_density_single_code = manipulation.access_country_code(pop_density)

    pop_density_correct_types = manipulation.convert_type_to_int(dataframe=pop_density_single_code,
                                                                 column_name="country_code",
                                                                 string_value="CC")

    locations_correct_types = manipulation.convert_type_to_int(dataframe=locations,
                                                               column_name="location_id",
                                                               string_value='"')

    population_location = manipulation.join_frames(left_dataframe=pop_density_correct_types,
                                                   right_dataframe=locations_correct_types,
                                                   left_column="country_code",
                                                   right_column="location_id")

    # Added module name here
    aggregation = analysis.aggregate_mean(dataframe=population_location,
                                          groupby_column="sdg_region_name",
                                          aggregate_column="population_density")

    formatted_statistic = analysis.format_frame(aggregation, "mean_population_density")

    input_output.write_output(formatted_statistic, output_filepath="./mean_pop_density.csv")


if __name__ == "__main__":
    get_analyse_output()
Continue on to the case study