diff --git a/Scraping Code/gsoc_yearly_data_generator.py b/Scraping Code/gsoc_yearly_data_generator.py index 9018b64..b75f516 100644 --- a/Scraping Code/gsoc_yearly_data_generator.py +++ b/Scraping Code/gsoc_yearly_data_generator.py @@ -24,21 +24,31 @@ def extraction(year): file = open("Data/GSOC_"+str(year)+"_Data.ods", "w") with requests.Session() as c: - if(year!=2019): page = c.get("https://summerofcode.withgoogle.com/archive/" + str(year) + "/organizations/") - else: page = c.get("https://summerofcode.withgoogle.com/organizations/#6230025286713344") + if(year==2019): page = c.get("https://summerofcode.withgoogle.com/organizations/#6230025286713344") + elif(year>=2016): page = c.get("https://summerofcode.withgoogle.com/archive/" + str(year) + "/organizations/") + else: page = c.get("https://www.google-melange.com/archive/gsoc/" + str(year)) plain_text = page.text soup = BeautifulSoup(plain_text, "lxml") dict_year = {} gsoc_year_organizations = [] - for name in soup.findAll('h4',{'class': 'organization-card__name font-black-54'}): + gsoc_organizations = soup.findAll('h4', {'class': 'organization-card__name font-black-54'}) + if(year<2016): + gsoc_organizations = soup.find('ul', {'class': 'mdl-list'}).findChildren('a') + + for name in gsoc_organizations: title = name.string gsoc_year_organizations.append(title) dict_year[title] = [] - + i=0 - for link in soup.findAll('a',{'class': 'organization-card__link'}): + links = soup.findAll('a',{'class': 'organization-card__link'}) + domain = "https://summerofcode.withgoogle.com" + if(year<2016): + links = soup.find('ul', {'class': 'mdl-list'}).findChildren('a') + domain = "https://www.google-melange.com" + for link in links: hrefs = link.get('href') - dict_year[gsoc_year_organizations[i]].append('https://summerofcode.withgoogle.com'+hrefs) + dict_year[gsoc_year_organizations[i]].append(domain+hrefs) i+=1 count = i