Coswara-Data Metadata Details
Some bar plots and histograms.
The COSWARA database is created by crowdsourcing sound samples. In this post we visualize the crowd (or participant) distribution along certain dimensions collected in the metadata questionnaire. The metadata and sound files are publicly available in the COSWARA-DATA github repository. In the below code we will read a CSV file containing the metadata of all participants (as of 13 Jan 2021), and plot the metadata distributions.
Step 1:
Load the CSVs
#collapse
# import some packages
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 10, 2020
@author: neeks, cmu
"""
import numpy as np
from os import path
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import librosa
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
AutoMinorLocator)
import matplotlib as mpl
sns.set_style("white")
sns.set_style("ticks")
# urls for csvs
url_metadata_annotations = 'https://raw.githubusercontent.com/iiscleap/Coswara-Exp/master/Annotated_v2/Annotated_v2_ans.csv'
url_metadata_recordings = 'https://raw.githubusercontent.com/iiscleap/Coswara-Data/master/combined_data.csv'
path_audio_data = '/Volumes/BackupNeeks/dBase/coswaradBase/annotatedFiles/audio_19Oct2020/'
path_store_figure = './figures/'
fig_save = 0
# load the csvs
df_2 = pd.read_csv(url_metadata_recordings) # recording metadata
#collapse
DF = df_2.copy()
# plot gender
gender_labels = DF['g'].unique()[::-1]
gender_cnt = []
for i in range(len(gender_labels)):
gender_cnt.append(len(DF[(DF['g'] == gender_labels[i])]))# & (DF['recording']=='breathing-deep')]))
clr_1 = 'tab:blue'
clr_2 = 'tab:red'
fig, ax = plt.subplots(figsize=(2, 6))
ax.bar(2,gender_cnt[0], align='center',alpha=1, ecolor='black',capsize=5,hatch="\\\\",color=clr_1,width=.6)
ax.bar(4,gender_cnt[1], align='center',alpha=1, ecolor='black',capsize=5,hatch="//",color=clr_2,width=.6)
for i, v in enumerate(gender_cnt):
ax.text(2*(i+1)-.3,v + 3, str(v), color='black', fontweight='bold',fontsize=14)
plt.xticks([2,4], ['MALE','FEMALE'],rotation=0)
plt.ylabel('COUNT', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
ax.set_xlim(1,5)
# ax.set_ylim(200,1500)
ax.grid(True)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# ax.figure.savefig(path_store_figure+"IS2020_coswara_metadata_gender.pdf", bbox_inches='tight')
plt.show()
#collapse
# plot age
age_labels = DF['a'].unique()
age_cnt_male = []
age_cnt_female = []
for i in range(len(age_labels)):
if age_labels[i] == 'X':
age_labels[i] = 0
age_cnt_male.append(len(DF[(DF['a'] == age_labels[i]) & (DF['g']=='male')]))
age_cnt_female.append(len(DF[(DF['a'] == age_labels[i]) & (DF['g']=='female')]))
age_cnt_male = DF[(DF['g']=='male')]['a'].values
age_cnt_female = DF[(DF['g']=='female')]['a'].values
age_labels = ['0-18', '18-30', '30-40', '40-50', '50-60', '60-70', '70-80']
age_grouped_male = []
age_grouped_female = []
for i in age_labels:
age_grouped_male.append(len(age_cnt_male[(age_cnt_male > (int(i.split('-')[0])-1)) & \
(age_cnt_male < int(i.split('-')[1]))]))
age_grouped_female.append(len(age_cnt_female[(age_cnt_female > (int(i.split('-')[0])-1)) & \
(age_cnt_female < int(i.split('-')[1]))]))
clr_1 = 'tab:blue'
clr_2 = 'tab:red'
fig, ax = plt.subplots(figsize=(7, 6))
ax.bar(np.arange(0,len(age_labels)),age_grouped_male, align='center',alpha=1,hatch="\\\\",ecolor='black',capsize=5,color=clr_1,width=.3,label='MALE')
ax.bar(np.arange(0,len(age_labels))+.3,age_grouped_female, align='center',alpha=1,hatch="\\\\",ecolor='black',capsize=5,color=clr_2,width=.3,label='FEMALE')
ax.legend(frameon=False,loc='upper right',fontsize=14)
plt.ylabel('COUNT', fontsize=14)
plt.xlabel('AGE GROUP', fontsize=14)
plt.xticks(np.arange(0,len(age_labels)), age_labels,rotation=0,fontsize=14)
plt.yticks(fontsize=14)
ax.grid(color='gray', linestyle='--', linewidth=1,alpha=.3)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# if fig_save:
# ax.figure.savefig(path_store_figure+"IS2020_coswara_metadata_age.pdf", bbox_inches='tight')
plt.show()
#collapse
# plot country
country_labels = DF['l_c'].unique()
country_cnt = []
for i in range(len(country_labels)):
country_cnt.append(len(DF[(DF['l_c'] == country_labels[i])]))
country_cnt = np.array(country_cnt)
indx = np.argsort(country_cnt)[::-1]
country_cnt = country_cnt[indx]
country_labels = country_labels[indx]
two_categories = [country_cnt[0],np.sum(country_cnt[1:])]
two_labels = ['INDIA','OUTSIDE']
fig, ax = plt.subplots(figsize=(2, 6))
ax.bar(2,two_categories[0], align='center',alpha=1, ecolor='black',capsize=5,hatch="\\\\",color=clr_1,width=.4)
ax.bar(4,two_categories[1], align='center',alpha=1, ecolor='black',capsize=5,hatch="//",color=clr_2,width=.4)
plt.xticks([2,4],two_labels,rotation=0,fontsize=14)
for i, v in enumerate(two_categories):
ax.text(2*(i+1)-.3,v + 3, str(v), color='black', fontweight='bold',fontsize=14)
ax.grid(color='gray', linestyle='--', linewidth=1,alpha=.3)
plt.ylabel('COUNT', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# if fig_save:
# ax.figure.savefig(path_store_figure+"IS2020_metadata_country.pdf", bbox_inches='tight')
plt.show()
#collapse
# plot state
state_labels = DF['l_s'].unique()
state_cnt = []
for i in range(len(state_labels)):
state_cnt.append(len(DF[(DF['l_s'] == state_labels[i])]))
state_cnt = np.array(state_cnt)
indx = np.argsort(state_cnt)[::-1][0:6]
state_cnt = state_cnt[indx]
state_labels = state_labels[indx]
fig, ax = plt.subplots(figsize=(6, 6))
ax.bar(np.arange(0,len(state_cnt)),state_cnt, align='center',alpha=1, ecolor='black',capsize=5,hatch="\\\\",color=clr_1,width=.25)
ax.set_ylabel('COUNT',fontsize=14)
# # ax.text(1.5,-9,'MEAN',horizontalalignment='center')
plt.xticks(np.arange(0,len(state_cnt)),state_labels,rotation=30,fontsize=14)
plt.yticks(fontsize=14)
for i, v in enumerate(state_cnt):
ax.text(i-.15,v + 3, str(v), color='black', fontweight='bold',fontsize=14)
ax.grid(color='gray', linestyle='--', linewidth=1,alpha=.3)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# if fig_save:
# ax.figure.savefig(path_store_figure+"IS2020_metadata_state.pdf", bbox_inches='tight')
plt.show()
#collapse
##### init path to list
path_to_lists = '/Users/neeks/Desktop/Documents/work/code/python_codes/notebookCodes/coswara/lists/'
#####
DF_more = DF.replace('None', np.nan)
DF_more = DF.replace(np.nan, False)
DF_more['healthy'] = False
DF_more['covid'] = 0.0
DF_more['unhealthy_but_no_covid'] = False
for i in range(len(DF_more)):
# healthy
if (DF_more.at[i,'covid_status']=='healthy') & (DF_more.at[i,'asthma']==False) &\
(DF_more.at[i,'cld']==False) & (DF_more.at[i,'cold']==False) &\
(DF_more.at[i,'cough']==False) & (DF_more.at[i,'pneumonia']==False) &\
(DF_more.at[i,'fever']==False):
DF_more.at[i,'healthy'] = True
# covid
if (DF_more.at[i,'covid_status']=='positive_asymp'):
DF_more.at[i,'covid'] = 1
if (DF_more.at[i,'covid_status']=='positive_mild'):
DF_more.at[i,'covid'] = 2
if (DF_more.at[i,'covid_status']=='positive_moderate'):
DF_more.at[i,'covid'] = 3
# unhealthy but not covid
if (DF_more.at[i,'covid_status']=='resp_illness_not_identified') & ((DF_more.at[i,'asthma']==True) |\
(DF_more.at[i,'cld']==True) | (DF_more.at[i,'cold']==True) |\
(DF_more.at[i,'cough']==True) | (DF_more.at[i,'pneumonia']==True)):
DF_more.at[i,'unhealthy_but_no_covid'] = True
health_categories_cnt_all = []
# get healthy
health_categories_cnt_all.append(len(DF_more[(DF_more['healthy']==True)]))
# get covid
health_categories_cnt_all.append(len(DF_more[(DF_more['covid']>0)]))
# get resp. ail.
health_categories_cnt_all.append(len(DF_more[(DF_more['unhealthy_but_no_covid']==True)]))
clr = ['tab:blue','tab:red','tab:green']
fig, ax = plt.subplots(figsize=(5, 6))
ax.bar(2, health_categories_cnt_all[0], align='center',alpha=1, ecolor='black',capsize=5,hatch="\\\\",color=clr[0],width=.3)
ax.bar(4, health_categories_cnt_all[1], align='center',alpha=1, ecolor='black',capsize=5,hatch="//",color=clr[1],width=.3)
ax.bar(6, health_categories_cnt_all[2], align='center',alpha=1, ecolor='black',capsize=5,hatch="\\\\",color=clr[2],width=.3)
plt.xticks([2,4,6],['HEALTHY','COVID-19','RESP. AIL \n (NOT COVID)'],rotation=0,fontsize=14)
for i, v in enumerate(health_categories_cnt_all):
ax.text(2*(i+1)-.1,v + 3, str(v), color='black', fontweight='bold',fontsize=14)
ax.grid(color='gray', linestyle='--', linewidth=1,alpha=.3)
plt.ylabel('COUNT', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.xlim(1,7)
# if fig_save:
# ax.figure.savefig(path_store_figure+"IS2020_metadata_country.pdf", bbox_inches='tight')
plt.show()
In another plot we will describe the details of a subset of these files which have been annotated.