import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
pio.renderers.default ='notebook'


harvest=pd.read_excel("harvest.xlsx", sheet_name=2)


harvest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   SN                               50 non-null     int64         
 1   Name                             50 non-null     object        
 2   Gender                           50 non-null     object        
 3   Age                              50 non-null     int64         
 4   District                         50 non-null     object        
 5   Latitude                         50 non-null     float64       
 6   Longitude                        50 non-null     float64       
 7   Date                             50 non-null     datetime64[ns]
 8   Land Size                        50 non-null     int64         
 9   Land ID                          50 non-null     object        
 10  Crops                            50 non-null     object        
 11  Updated Yield                    50 non-null     int64         
 12  Updated Income from Agriculture  50 non-null     int64         
 13  Updated Total income             50 non-null     float64       
 14  # Trained                        50 non-null     int64         
 15  Satisfaction                     50 non-null     int64         
 16  Perception                       50 non-null     int64         
dtypes: datetime64[ns](1), float64(3), int64(8), object(5)
memory usage: 6.8+ KB


gender_count_respondents=harvest.Gender.value_counts()
plt.style.use("ggplot")
ax=harvest.Gender.value_counts().plot(kind="bar",
                                      figsize = [4,3],
                                      width = .4,
                                      color = ["cornflowerblue", "hotpink"],
                                      ylim = [0,40])
ax.set_xticklabels(["Male","Female"], rotation="horizontal")
ax.bar_label(ax.containers[0], padding=5)
plt.title("# Respondents' Gender")
plt.xlabel("Gender")
plt.show()


def age_group(Age):
    if Age > 50:
        return "Over 50"
    elif Age >= 41:
        return "41 to 50"
    elif Age >=31:
        return "31 to 40"
    elif Age >=21:
        return "21 to 30"
    else:
        return "20 or Under"
respondent_ageGroup=harvest.Age.apply(age_group).value_counts(normalize=True).mul(100).round(1).sort_index()


respondent_ageGroup

20 or Under     6.0
21 to 30       24.0
31 to 40       32.0
41 to 50       24.0
Over 50        14.0
Name: Age, dtype: float64


plt.pie(respondent_ageGroup, 
        labels=respondent_ageGroup.index, 
        autopct = "%1.0f%%", 
        wedgeprops={'linewidth':1, 'edgecolor': 'dimgrey'})
plt.title("Age Group of Respondents")
plt.show()


ax=sns.boxplot(data=harvest, x="Gender", y="Age")
ax.set_xticklabels(["Male", "Female"])
plt.title("Age Distribution by Gender")
plt.show()


baseline=pd.read_excel("harvest.xlsx", sheet_name=0)
baseline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   SN                                50 non-null     int64         
 1   Name                              50 non-null     object        
 2   Gender                            50 non-null     object        
 3   Age                               50 non-null     int64         
 4   District                          50 non-null     object        
 5   Latitude                          50 non-null     float64       
 6   Longitude                         50 non-null     float64       
 7   Date                              50 non-null     datetime64[ns]
 8   Land Size                         50 non-null     int64         
 9   Land ID                           50 non-null     object        
 10  Crops                             50 non-null     object        
 11  Baseline Yield                    50 non-null     int64         
 12  Baseline Income from Agriculture  50 non-null     int64         
 13  Baseline Total income             50 non-null     float64       
dtypes: datetime64[ns](1), float64(3), int64(5), object(5)
memory usage: 5.6+ KB


combined_baseline=harvest.merge(baseline, on="Land ID", suffixes=("_harvest", "_baseline"))


combined_baseline.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Data columns (total 30 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   SN_harvest                        50 non-null     int64         
 1   Name_harvest                      50 non-null     object        
 2   Gender_harvest                    50 non-null     object        
 3   Age_harvest                       50 non-null     int64         
 4   District_harvest                  50 non-null     object        
 5   Latitude_harvest                  50 non-null     float64       
 6   Longitude_harvest                 50 non-null     float64       
 7   Date_harvest                      50 non-null     datetime64[ns]
 8   Land Size_harvest                 50 non-null     int64         
 9   Land ID                           50 non-null     object        
 10  Crops_harvest                     50 non-null     object        
 11  Updated Yield                     50 non-null     int64         
 12  Updated Income from Agriculture   50 non-null     int64         
 13  Updated Total income              50 non-null     float64       
 14  # Trained                         50 non-null     int64         
 15  Satisfaction                      50 non-null     int64         
 16  Perception                        50 non-null     int64         
 17  SN_baseline                       50 non-null     int64         
 18  Name_baseline                     50 non-null     object        
 19  Gender_baseline                   50 non-null     object        
 20  Age_baseline                      50 non-null     int64         
 21  District_baseline                 50 non-null     object        
 22  Latitude_baseline                 50 non-null     float64       
 23  Longitude_baseline                50 non-null     float64       
 24  Date_baseline                     50 non-null     datetime64[ns]
 25  Land Size_baseline                50 non-null     int64         
 26  Crops_baseline                    50 non-null     object        
 27  Baseline Yield                    50 non-null     int64         
 28  Baseline Income from Agriculture  50 non-null     int64         
 29  Baseline Total income             50 non-null     float64       
dtypes: datetime64[ns](2), float64(6), int64(13), object(9)
memory usage: 12.1+ KB


combined_baseline[["District_harvest", "Updated Yield", "Baseline Yield"]].head()


df=combined_baseline.groupby("District_harvest").agg({"Updated Yield": "sum", "Baseline Yield": "sum"})

df


district=df.index
updated=df["Updated Yield"]
baseline=df["Baseline Yield"]


plt.figure(figsize=[8,6])
plt.bar(district, updated, 
        width = 0.4, 
        align="edge", 
        color = "cornflowerblue", 
        label = "Current Yield")
plt.bar(district, baseline, 
        align="edge", 
        width = -0.4, 
        color = "orange", 
        label = "Baseline Yield")
plt.legend(shadow=True)
plt.title("Baseline vs Current Yield")
plt.xlabel("District")
plt.ylabel("Total yield in KG")
plt.tight_layout()
plt.show()


df4=combined_baseline[["District_harvest", "Baseline Yield", "Updated Yield"]]


df5=combined_baseline.rename(columns=
                             {"District_harvest": "District", "Crops_harvest": "Crops", "Updated Yield": "Current Yield"}
                            )


plt.figure(figsize=[10,8])
ax1=sns.catplot(data=df5, 
                x = "District", 
                y="Baseline Yield", 
                kind="bar", 
                col="Crops", 
                color="orange", 
                errorbar=("ci",0), 
                estimator=sum)

plt.ylim(0,11000)
ax1.set_axis_labels("", "Baseline Yield")

plt.subplots_adjust(top=0.85)
plt.suptitle("Baseline and Current Yield per crops", va="bottom", size=20)

ax2=sns.catplot(data=df5, 
                x = "District", 
                y="Current Yield", 
                kind="bar", 
                col="Crops", 
                color="cornflowerblue", 
                errorbar=("ci",0), 
                estimator=sum)

plt.tight_layout()
plt.show()

<Figure size 1000x800 with 0 Axes>


df1=combined_baseline.groupby("District_harvest").agg(
    {"Updated Income from Agriculture": "sum",
    "Baseline Income from Agriculture": "sum"}
)
df1


district1=df1.index
updatedIncome=df1["Updated Income from Agriculture"]
baselineIncome=df1["Baseline Income from Agriculture"]


plt.figure(figsize=[8,6])
plt.bar(district1, updatedIncome, 
        width = 0.4, 
        align="edge", 
        color = "cornflowerblue", 
        label = "Current Agricultural Income")

plt.bar(district1, baselineIncome, 
        align="edge", 
        width = -0.4, 
        color = "orange", 
        label = "Baseline Agricultural Income")

plt.legend(shadow=True)
plt.title("Baseline vs Current Agricultural Income")
plt.xlabel("District")
plt.ylabel("Total income in USD")
plt.tight_layout()
plt.show()


df2=combined_baseline.groupby("District_harvest").agg(
    {"Updated Total income": "sum", 
     "Baseline Total income": "sum"}
)
df2


district2=df2.index
upTotalIncome=df2["Updated Total income"]
baseTotalIncome=df2["Baseline Total income"]


plt.figure(figsize=[8,6])
plt.bar(district2, upTotalIncome, 
        width = 0.4, 
        align="edge", 
        color = "cornflowerblue", 
        label = "Current Total")

plt.bar(district1, baseTotalIncome, 
        align="edge", 
        width = -0.4, 
        color = "orange", 
        label = "Baseline Total")

plt.legend(shadow=True)
plt.title("Baseline vs Current Total Income")
plt.xlabel("District")
plt.ylabel("Total income in USD")
plt.tight_layout()
plt.show()


sns.scatterplot(data=harvest, 
                x="Updated Income from Agriculture", 
                y = "Updated Total income")

plt.ylabel("Total Income")
plt.xlabel("Agricultural Income")
plt.title("Relation Between Current Agricultural and Total Income")
plt.show()


ax=sns.relplot(data=harvest, 
               x="Updated Income from Agriculture", 
               y = "Updated Total income", 
               kind="scatter", 
               col = "Crops")

ax.set_axis_labels("Agricultural Income", "Total Income", size = 15)

plt.subplots_adjust(top=0.85)
plt.suptitle("Current Total and Agricultural Income by Crops Type", size = 20)
plt.show()


gender_count_trainees=harvest.groupby("Gender")["# Trained"].sum().sort_values(ascending=False)
gender_count_trainees

Gender
M    54
F    22
Name: # Trained, dtype: int64


ax=gender_count_trainees.plot(kind="bar",
                            figsize = [4,3],
                            width = .4,
                            color = ["cornflowerblue", "hotpink"],
                            ylim = [0,60])

ax.set_xticklabels(["Male","Female"], rotation="horizontal")
ax.bar_label(ax.containers[0], padding=3)
plt.title("# Trainees' Gender")
plt.xlabel("Gender")
plt.show()


harvest.Satisfaction.mean()

3.44


df3=harvest[["District", "Satisfaction", "Gender", "Perception"]]


plt.figure(figsize=[4,3])
ax=sns.barplot(df3, 
               x="Gender", 
               y="Satisfaction", 
               errorbar=("ci",0), 
               width = 0.4, 
               palette = ["cornflowerblue", "hotpink"])

ax.set_xticklabels(["Male","Female"], rotation="horizontal")
ax.bar_label(ax.containers[0], fmt="%.2f", padding=5)
plt.title("Community Satisfaction by Gender")
plt.ylim(0,5)
plt.show()


ax=sns.barplot(df3, 
               x="District", 
               y="Satisfaction", 
               errorbar=("ci",0), 
               hue="Gender", 
               palette = ["cornflowerblue", "hotpink"])

new_labels = ["Male", "Female"]
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, new_labels, facecolor="white")
plt.ylim(0,5)
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f")
plt.title("Community Satisfaction by Location and Gender")
plt.tight_layout()
plt.show()


ax=sns.displot(data=harvest, 
               kind="hist", 
               x="Satisfaction", 
               bins=5, 
               col="Gender", 
               hue="Gender", 
               palette = ["cornflowerblue", "hotpink"], 
               alpha=1, 
               facet_kws=dict(margin_titles=True), 
               legend=False)

new_titles = ["Male", "Female"]
for i, ax in enumerate(ax.axes.flat):
    ax.set_title(new_titles[i], fontsize=12, weight="light")

plt.subplots_adjust(top=0.85)
plt.suptitle("Community Satisfaction by Gender", size=15)
    
plt.show()


df3.Perception.mean()

3.46


plt.figure(figsize=[4,3])
ax=sns.barplot(df3, 
               x = "Gender", 
               y = "Perception", 
               width=0.4, 
               errorbar=("ci",0), 
               palette=["cornflowerblue", "hotpink"])

plt.ylim(0,5)
ax.bar_label(ax.containers[0], fmt="%.2f", padding=5)
ax.set_xticklabels(["Male", "Female"], rotation="horizontal")
plt.title("Impact Perception by Gender")
plt.show()


ax=sns.barplot(df3, 
               x="District", 
               y="Perception", 
               hue="Gender", 
               errorbar=("ci",0), 
               palette=["cornflowerblue", "hotpink"])

new_labels = ["Male", "Female"]
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, new_labels, facecolor="white")
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f")
plt.ylim(0,5)
plt.title("Impact Perception by Location")
plt.tight_layout()
plt.show()


ax=sns.displot(data=harvest, 
               kind="hist", 
               x="Perception", 
               bins=5, col="Gender", 
               hue="Gender", 
               palette = ["cornflowerblue", "hotpink"], 
               alpha=1, facet_kws=dict(margin_titles=True), 
               legend=False)

new_titles = ["Male", "Female"]
for i, ax in enumerate(ax.axes.flat):
    ax.set_title(new_titles[i], fontsize=12, weight="light")

plt.subplots_adjust(top=0.85)
plt.suptitle("Impact Perception by Gender", size=15)
    
plt.show()


infrastructure=pd.read_excel("harvest.xlsx", sheet_name=4, index_col="District")


infrastructure.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Mogadishu to Burco
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Location           5 non-null      object        
 1   Structure ID       5 non-null      int64         
 2   Type of Structure  5 non-null      object        
 3   Size (cubic m)     5 non-null      int64         
 4   Cost (USD)         5 non-null      int64         
 5   Capacity (litres)  5 non-null      int64         
 6   Volume Harvested   5 non-null      int64         
 7   Harvested Date     5 non-null      datetime64[ns]
 8   Frequency          5 non-null      object        
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 400.0+ bytes


infrastructure[["Type of Structure", "Capacity (litres)", "Volume Harvested", "Frequency"]]

	Updated Yield	Baseline Yield
District_harvest
Baidowa	11745	7700
Burco	11725	7400
Garowe	12385	8700
Hargeisa	13595	8600
Mogadishu	10680	7300

	Updated Income from Agriculture	Baseline Income from Agriculture
District_harvest
Baidowa	35713	22970
Burco	34790	21790
Garowe	52126	34825
Hargeisa	43036	26035
Mogadishu	42978	29525

	Updated Total income	Baseline Total income
District_harvest
Baidowa	55645.6350	35152.70
Burco	55131.9000	34786.50
Garowe	74401.8375	51706.65
Hargeisa	64396.0250	38878.60
Mogadishu	68049.2200	46688.40

	Type of Structure	Capacity (litres)	Volume Harvested	Frequency
District
Mogadishu	Reservoir	150000	90000	weekly
Baidowa	Reservoir	120000	72000	weekly
Garowe	Reservoir	90000	54000	weekly
Hargeisa	Reservoir	90000	54000	weekly
Burco	Reservoir	75000	45000	weekly

Water Harvesting Project¶

Data Analysis and Visualization using Python (Pandas, Matplotlib, and Seaborn Libraries)¶

1) Importing required libraries¶

2) Importing relevant excel file¶

3) To show gender distribution of respondents¶

4) To show age distribution of respondents¶

5) Yield comparision (Baseline vs Current)¶

Baseline and Current yield comparision based on type of crops¶

6) Agricultural income comparision (Baseline vs Current)¶

7) Total income comparision (Baseline vs Current)¶

8) Income relation (Agricultural and Total)¶

Scatterplot using Seaborn Libraries¶

9) Gender distribution of training participants¶

10) Community Satisfaction¶

Barplot using seaborn libraries¶

11) Impact perception - average total, as per location, as per gender¶

12) Reservoir capacity and harvest per week¶

	District_harvest	Updated Yield	Baseline Yield
0	Mogadishu	1600	800
1	Mogadishu	875	500
2	Mogadishu	840	800
3	Mogadishu	1500	1000
4	Mogadishu	630	600