The Coswara database is created by crowdsourcing respiratory sound samples. In this post, we visualize the distribution of the crowd (i.e., the participants) along certain dimensions collected in the metadata questionnaire. We read the CSV file containing the metadata of all the users (as of 07 August 2020). From the whole dataset, we have manually listened to the audio samples of 941 participants. Below we present this data. More detailed documentation is also available here and will be presented at the Interspeech 2020 conference.

First we visualize the gender, age, and country-wise (India/outside) distribution.

#collapse
# import some packages
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
# Apply the seaborn styles in sequence: "white" first, then "ticks".
for style_name in ("white", "ticks"):
    sns.set_style(style_name)

# load CSV file
# The metadata CSV is read from the local ./my_data/ directory into DF,
# the data frame every plot in this post is built from.
fname = 'combined_plus_annotated_IS2020.csv'
DF = pd.read_csv(f'./my_data/{fname}')

# plot gender information
# Left panel of a 1x3 figure: participant count per gender.
#
# Only rows that carry a 'cough-heavy-quality' annotation are counted. The
# raw (non-boolean) annotation column used to be and-ed straight into the
# mask; .notna() spells out the "has an annotation" test explicitly.
# NOTE(review): assumes a missing annotation is loaded as NaN -- confirm.
# NOTE(review): the age and country panels count every row of DF, without
# this filter -- confirm which population each panel is meant to show.
annotated = DF['cough-heavy-quality'].notna()
gender_labels = DF['g'].dropna().unique()
gender_cnt = [int(((DF['g'] == gender) & annotated).sum()) for gender in gender_labels]

# Create a bare figure: plt.subplots() would additionally create a
# full-figure Axes, which recent Matplotlib versions leave in place
# underneath the panels added with plt.subplot().
fig = plt.figure(figsize=(16, 4))
ax = plt.subplot(1, 3, 1)

# One bar per category found in the data, at x = 2, 4, ...; the tick labels
# come from the same array as the counts, so a bar can never be mislabelled
# by the order in which the categories happen to appear in the CSV.
hatches = ["\\\\", "//"]
bar_x = [2 * (i + 1) for i in range(len(gender_labels))]
for i, (x, count) in enumerate(zip(bar_x, gender_cnt)):
    ax.bar(x, count, align='center', alpha=1, ecolor='black', capsize=5,
           hatch=hatches[i % len(hatches)], color='blue', width=.5)
    # count printed just above the bar
    ax.text(x - .2, count + 3, str(count), color='black', fontweight='bold', fontsize=14)
plt.xticks(bar_x, [str(gender).upper() for gender in gender_labels], rotation=0)
plt.ylabel('PARTICIPANT COUNT', fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
ax.grid(True)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)


# plot age information
# Middle panel: one bar per distinct value of the 'a' column, whose height
# is the number of rows holding that value.
age_labels = DF['a'].unique()
age_cnt = [len(DF[DF['a'] == age]) for age in age_labels]

ax = plt.subplot(1, 3, 2)
ax.bar(age_labels, age_cnt, align='center', alpha=1, ecolor='black', capsize=5, color='blue')
ax.set_ylabel('PARTICIPANT COUNT', fontsize=14)
ax.set_xlabel('AGE', fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
ax.grid(True)
# open frame: hide the right and top borders
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)


# plot country information
# Right panel: participants from the most frequent country vs. all others.
country_labels = DF['l_c'].unique()
country_cnt = np.array([len(DF[DF['l_c'] == country]) for country in country_labels])
# reorder countries from most to fewest participants
indx = np.argsort(country_cnt)[::-1]
country_cnt = country_cnt[indx]
country_labels = country_labels[indx]
# Collapse to two groups: the largest country and everything else combined.
# NOTE(review): the 'INDIA' label presumes the largest country in the data
# is India -- confirm against country_labels[0].
two_categories = [country_cnt[0], np.sum(country_cnt[1:])]
two_labels = ['INDIA', 'OTHERS']

ax = plt.subplot(1, 3, 3)
bar_x = [2, 4]
bar_style = dict(align='center', alpha=1, ecolor='black', capsize=5,
                 hatch="\\\\", color='blue', width=.5)
for x, total in zip(bar_x, two_categories):
    ax.bar(x, total, **bar_style)
plt.xticks(bar_x, two_labels, rotation=0)
# print each count just above its bar
for x, total in zip(bar_x, two_categories):
    ax.text(x - .25, total + 3, str(total), color='black', fontweight='bold', fontsize=14)
ax.grid(True)
plt.ylabel('PARTICIPANT COUNT', fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
plt.show()

India has 29 states. Next, let's visualize the distribution across states. We show only the top 5 below.

#collapse
# Bar chart of the states with the most participants.
# The accompanying text says only the top 5 states are shown, so exactly
# five are kept here.
TOP_N_STATES = 5

# Count rows per distinct value of the 'l_s' (state) column.
# NOTE(review): every row of DF is counted; no filter on country is applied.
state_labels = DF['l_s'].unique()
state_cnt = np.array([len(DF[DF['l_s'] == state]) for state in state_labels])
# indices of the TOP_N_STATES largest counts, largest first
indx = np.argsort(state_cnt)[::-1][:TOP_N_STATES]
state_cnt = state_cnt[indx]
state_labels = state_labels[indx]

fig, ax = plt.subplots(figsize=(8, 4))
positions = np.arange(len(state_cnt))
ax.bar(positions, state_cnt, align='center', alpha=1, ecolor='black', capsize=5,
       hatch="\\\\", color='blue', width=.25)
ax.set_ylabel('PARTICIPANT COUNT', fontsize=14)
plt.xticks(positions, state_labels, rotation=30, fontsize=13)
# print each count just above its bar
for pos, count in zip(positions, state_cnt):
    ax.text(pos - .15, count + 3, str(count), color='black', fontweight='bold', fontsize=14)
ax.grid(True)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.show()

Next, we group the participants into two groups, healthy and unhealthy. This grouping is based on absence/presence of any respiratory ailments reported in the metadata questionnaire.

#collapse
# Split participants into HEALTHY / UNHEALTHY and plot the two counts.
labels = ['HEALTHY','UNHEALTHY']

# Questionnaire columns that flag a respiratory ailment or symptom.
symptom_cols = ['asthma', 'cld', 'cold', 'cough', 'pneumonia', 'fever']
# Explicit `== True` (rather than truthiness) so that a missing answer is
# treated as "not reported", exactly like the per-column `!= True` /
# `== True` tests this mask replaces.
has_symptom = (DF[symptom_cols] == True).any(axis=1)
status = DF['covid_status']

# HEALTHY: status is 'healthy' and none of the symptom columns is True.
# UNHEALTHY: status is one of the two listed values, or any symptom is True.
# NOTE(review): a row with any other covid_status and no symptom flag falls
# into neither group -- confirm that this is intended.
healthy_mask = (status == 'healthy') & ~has_symptom
unhealthy_mask = status.isin(['resp_illness_not_identified', 'positive_mild']) | has_symptom
category_cnt = [len(DF.loc[healthy_mask]), len(DF.loc[unhealthy_mask])]

fig, ax = plt.subplots(figsize=(4, 4))
bar_x = [2, 4]
for x, count, name in zip(bar_x, category_cnt, labels):
    # the artist label names the category the bar actually shows
    ax.bar(x, count, align='center', alpha=1, ecolor='black', capsize=5,
           hatch="\\\\", color='blue', width=.5, label=name.lower())
plt.xticks(bar_x, labels, rotation=0)
# print each count just above its bar
for x, count in zip(bar_x, category_cnt):
    ax.text(x - .25, count + 3, str(count), color='black', fontweight='bold', fontsize=14)
ax.grid(True)
plt.ylabel('COUNT', fontsize=14)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.show()

Lastly, we have listened to the audio data of all 941 participants and classified every audio file as clean, noisy, or bad (highly degraded). The resulting distribution across the nine sound categories is shown below.

#collapse
# Grouped bar chart: for each sound category, the number of recordings
# annotated as clean, noisy and bad.
labels = ['vowel-o','vowel-e','vowel-a','cough-shallow','cough-heavy','breathing-shallow','breathing-deep',\
          'counting-normal','counting-fast']

# category_cnt is a flat list: [clean, noisy, bad] for the first label,
# then [clean, noisy, bad] for the second, and so on.
category_cnt = []
for sound in labels:
    present = DF[sound] == sound
    quality = DF[sound + '-quality']
    cont_yes = DF[sound + '-cont'] == 'y'
    cont_no = DF[sound + '-cont'] == 'n'
    is_clean = quality == 'clean_audio'
    is_noisy = quality == 'noisy_audio'
    is_bad = quality == 'bad_audio'
    # clean: clean_audio with both the '-cont' and '-vol' flags set to 'y'
    # NOTE(review): clean_audio rows with '-cont' == 'y' but '-vol' != 'y'
    # are counted in none of the three groups -- confirm this is intended.
    n_clean = len(DF[present & is_clean & cont_yes & (DF[sound + '-vol'] == 'y')])
    # noisy: noisy_audio with '-cont' == 'y'
    n_noisy = len(DF[present & is_noisy & cont_yes])
    # bad: bad_audio, or clean/noisy audio whose '-cont' flag is 'n'
    n_bad = len(DF[present & (is_bad | (is_clean & cont_no) | (is_noisy & cont_no))])
    category_cnt.extend([n_clean, n_noisy, n_bad])

fig = plt.subplots(figsize=(10,4))
ax = plt.subplot(1,1,1)

shared_style = dict(align='center', alpha=1, ecolor='black', capsize=5, hatch="\\\\", width=1)
series = (('clean', 'blue'), ('noisy', 'red'), ('bad', 'green'))
group_stride = 4  # three bars plus one empty slot between groups
n_groups = len(category_cnt) // 3
xticks = []
for group in range(n_groups):
    left = group * group_stride
    for offset, (name, colour) in enumerate(series):
        # only the first group carries legend labels, so each series
        # appears once in the legend
        legend = {'label': name} if group == 0 else {}
        ax.bar(left + offset, category_cnt[3 * group + offset], color=colour,
               **shared_style, **legend)
    xticks.append(left + .5)
cnt = n_groups * group_stride
ax.set_xticks(xticks)
ax.set_xticklabels(labels, rotation=30, fontsize=13)
ax.grid(True)
ax.set_xlim(-2, cnt + 2)
ax.legend(loc='upper right', frameon=False, bbox_to_anchor=(1.05, 1), fontsize=13)
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
plt.ylabel('COUNT', fontsize=14)
plt.yticks(fontsize=13)
plt.show()

In another post we will attempt to describe the acoustic features of the audio samples. Looking forward to having you with us in this exploration.