Getting files from master of ankit to master of kaustubh #2

Merged
merged 13 commits into from Aug 17, 2020
Empty file removed Basic-Scripts/tic-tac-toe.py
Empty file.
5 changes: 3 additions & 2 deletions Contribution.md
@@ -13,7 +13,7 @@
- Fork the [repository](https://github.com/ankitdobhal/Awesome-Python-Scripts)

- Clone the fork [repo](https://github.com/ankitdobhal/Awesome-Python-Scripts)
- git clone https://github.com/ankitdobhal/Awesome-Python-Scripts
- git clone https://github.com/<Your_Username>/Awesome-Python-Scripts
- Create new branch
- git checkout -b <Your-Branch-Name>

@@ -22,8 +22,9 @@

- Add a commit message!
- git commit -a -m "<Added your message>"

- Push changes
- git push origin
- git push -u origin <name_of_your_branch>

- Create pull requests
- [Try to Mention the related issue for your PR]
52 changes: 52 additions & 0 deletions System-Automation-Scripts/junk_organiser.py
@@ -0,0 +1,52 @@
import os
from pathlib import Path as pt

DIRECTORIES = {
    "HTML": [".html5", ".html", ".htm", ".xhtml"],
    "IMAGES": [".jpeg", ".jpg", ".tiff", ".gif", ".bmp", ".png", ".bpg", ".svg",
               ".heif", ".psd"],
    "VIDEOS": [".avi", ".flv", ".wmv", ".mov", ".mp4", ".webm", ".vob", ".mng",
               ".qt", ".mpg", ".mpeg", ".3gp"],
    "DOCUMENTS": [".oxps", ".epub", ".pages", ".docx", ".doc", ".fdf", ".ods",
                  ".odt", ".pwi", ".xsn", ".xps", ".dotx", ".docm", ".dox",
                  ".rvg", ".rtf", ".rtfd", ".wpd", ".xls", ".xlsx", ".ppt",
                  ".pptx"],
    "ARCHIVES": [".a", ".ar", ".cpio", ".iso", ".tar", ".gz", ".rz", ".7z",
                 ".dmg", ".rar", ".xar", ".zip"],
    "AUDIO": [".aac", ".aa", ".dvf", ".m4a", ".m4b", ".m4p", ".mp3",
              ".msv", ".ogg", ".oga", ".raw", ".vox", ".wav", ".wma"],
    "PLAINTEXT": [".txt", ".in", ".out"],
    "PDF": [".pdf"],
    "PYTHON": [".py"],
    "C": [".c"],
    "CPP": [".cpp"],
    "JAVA": [".java"],
    "XML": [".xml"],
    "EXE": [".exe"],
    "SHELL": [".sh"]
}

# Invert DIRECTORIES so each extension maps directly to its target directory name.
FILE_FORMATS = {file_format: directory
                for directory, file_formats in DIRECTORIES.items()
                for file_format in file_formats}


def org_junk():
    # Move every file in the current directory into the folder named after its extension group.
    for entry in os.scandir():
        if entry.is_dir():
            continue
        file_path = pt(entry)
        file_format = file_path.suffix.lower()
        if file_format in FILE_FORMATS:
            directory_path = pt(FILE_FORMATS[file_format])
            directory_path.mkdir(exist_ok=True)
            file_path.rename(directory_path.joinpath(file_path.name))

    # Remove any directories that are left empty.
    for directory in os.scandir():
        try:
            os.rmdir(directory)
        except OSError:
            pass


if __name__ == "__main__":
    org_junk()
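As a quick illustration of the `FILE_FORMATS` inversion used above, here is a minimal sketch with a trimmed-down `DIRECTORIES` dict (not part of the script):

```python
DIRECTORIES = {"PDF": [".pdf"], "IMAGES": [".jpg", ".png"]}

# Same dict-comprehension inversion as in junk_organiser.py.
FILE_FORMATS = {file_format: directory
                for directory, file_formats in DIRECTORIES.items()
                for file_format in file_formats}

print(FILE_FORMATS)  # {'.pdf': 'PDF', '.jpg': 'IMAGES', '.png': 'IMAGES'}
```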
4 changes: 4 additions & 0 deletions Web-Scraping/Medium-Articles-Details-Scrapping/.gitignore
@@ -0,0 +1,4 @@
# Unnecessary files

app/__pycache__/
.idea
42 changes: 42 additions & 0 deletions Web-Scraping/Medium-Articles-Details-Scrapping/README.md
@@ -0,0 +1,42 @@
# Medium-Articles-Details-Scrapping
This script scrapes details about Medium articles published within a date range in the given publication. The dates are chosen randomly; if there is no article on a chosen date, that date is skipped. The result is a DataFrame that can be saved in any format (currently saved as CSV). Here is a preview of the terminal:
![](terminal-preview.PNG)

# Requirements
- numpy
- pandas
- bs4
- requests

# How to run?
- Run the command: python run.py

# About the Scrap class
A scraper that collects details about Medium articles published in a date range in a publication by selecting random dates.

Attributes
----------
- `urls_dict` (dict): key-value pairs of publication names and links, e.g. `urls_dict={"The Startup": "https://medium.com/swlh"}`
- `start_date` (str): starting date of the search. Default: `2020-01-01`
- `end_date` (str): ending date of the search. Default: `2020-08-01`
- `year` (int): year in which the search is done. Default: `2020`
- `number` (int): number of random dates to pick. Default: `10`

Methods
-------
- `scrap()`: starts the scraping process.
- `dataframe()`: returns the DataFrame object.
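
A minimal usage sketch based on the attributes and methods above (the values are illustrative; see `run.py` for the interactive driver):

```python
from app import Scrap

scraper = Scrap(urls_dict={"The Startup": "https://medium.com/swlh"},
                start_date="2020-01-01", end_date="2020-08-01",
                year=2020, number=10)
scraper.scrap()           # visit the randomly chosen archive pages
df = scraper.dataframe()  # collect the results as a pandas DataFrame
df.to_csv("results.csv")
```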

79 changes: 79 additions & 0 deletions Web-Scraping/Medium-Articles-Details-Scrapping/app/__init__.py
@@ -0,0 +1,79 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import numpy as np
import itertools
import time


class Scrap:

    def __init__(self, urls_dict, start_date='2020-01-01', end_date='2020-08-01', number=10, year=2020):
        self.urls = urls_dict
        self.start = pd.to_datetime(start_date)
        self.end = pd.to_datetime(end_date)
        self.n = number
        self.year = year
        self.titles = []
        self.sub_titles = []
        self.article_link = []
        self.claps = []
        self.reading_time = []
        self.responses = []
        self.pubs = []
        self.dates_list = []

    def randDates(self):
        # Draw self.n random Unix timestamps between start and end, then convert them back to dates.
        start_u = self.start.value // 10**9
        end_u = self.end.value // 10**9
        return pd.DatetimeIndex((10**9 * np.random.randint(start_u, end_u, self.n, dtype=np.int64)).view('M8[ns]')).date

    def scrap(self):
        dates = pd.to_datetime(pd.Series(self.randDates()))
        for i in range(len(dates)):
            month = dates.dt.month[i]
            day = dates.dt.day[i]
            for publication, url in self.urls.items():
                # Medium archive pages follow the pattern <publication>/archive/<year>/<month>/<day>.
                url = url + '/archive/{0}/{1:02d}/{2:02d}'
                print(f'Publication: {publication}, Date: {self.year}-{month}-{day}')
                response = requests.get(url.format(self.year, month, day), allow_redirects=True)
                # Medium redirects away from the archive URL when there are no articles for that date; skip those.
                if not response.url.startswith(url.format(self.year, month, day)):
                    continue
                page = response.content
                soup = BeautifulSoup(page, 'html.parser')
                articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")

                number = len(articles)  # one row per article on this archive page

                self.titles.append([i.find('h3', class_="graf--title").text if i.find('h3', class_="graf--title") is not None else '' for i in articles])

                self.sub_titles.append([i.find("h4", class_="graf--subtitle").text if i.find("h4", class_="graf--subtitle") is not None else '' for i in articles])

                self.article_link.append([i.find_all('a')[3]['href'].split('?')[0] for i in articles])

                # Clap counts are rendered either as plain numbers or with a 'K' suffix (e.g. '1.2K').
                self.claps.append([0 if (k is None) or (k == '') or (k.split is None) else int(float(k.split('K')[0]) * 1000) if len(k.split('K')) == 2 else int(float(k.split('K')[0])) for k in [j.text for j in [i.find_all('button')[1] for i in articles]]])

                self.reading_time.append([int(i.find("span", class_="readingTime")['title'].split()[0]) if i.find("span", class_="readingTime") is not None else 0 for i in articles])

                self.responses.append([i.find_all('a')[6].text.split(' ')[0] if (len(i.find_all('a')) == 7) and len(i.find_all('a')[6].text.split(' ')) != 0 else 0 for i in articles])

                self.pubs.append([publication] * number)

                self.dates_list.append([f'{self.year}-{month}-{day}'] * number)  # repeat the date so all columns stay aligned

                time.sleep(0.3)

    def dataframe(self):
        columns = ['Title', 'SubTitle', 'Link', 'Claps', 'Reading_Time', 'Responses', 'Publication', 'Date_Published']
        # Flatten the per-page lists into single columns.
        titles = list(itertools.chain.from_iterable(self.titles))
        sub_titles = list(itertools.chain.from_iterable(self.sub_titles))
        article_link = list(itertools.chain.from_iterable(self.article_link))
        claps = list(itertools.chain.from_iterable(self.claps))
        reading_time = list(itertools.chain.from_iterable(self.reading_time))
        responses = list(itertools.chain.from_iterable(self.responses))
        pubs = list(itertools.chain.from_iterable(self.pubs))
        dates = list(itertools.chain.from_iterable(self.dates_list))

        return pd.DataFrame(zip(titles, sub_titles, article_link, claps, reading_time, responses, pubs, dates), columns=columns)
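For reference, a minimal sketch of the clap-count conversion performed by the dense comprehension in `scrap()` above; the helper name `parse_claps` is ours, not part of the script:

```python
def parse_claps(text):
    """Convert a Medium clap string such as '847' or '1.2K' to an int (0 for empty)."""
    if not text:
        return 0
    if 'K' in text:
        return int(float(text.split('K')[0]) * 1000)
    return int(float(text))


assert parse_claps('') == 0
assert parse_claps('847') == 847
assert parse_claps('1.2K') == 1200
```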
31 changes: 31 additions & 0 deletions Web-Scraping/Medium-Articles-Details-Scrapping/run.py
@@ -0,0 +1,31 @@
from app import Scrap

print('-----------------')
pub_name = [i.strip() for i in input('Enter the comma separated list of publication names (The Startup, Medium ...): ').split(',')]
pub_link = [i.strip() for i in input('Enter the comma separated links of publications (https://medium.com/swlh, https://towardsdatascience.com ...): ').split(',')]

if len(pub_name) != len(pub_link):
    raise SystemExit('Please enter links for all publications!')

pub_dict = {i: j for i, j in zip(pub_name, pub_link)}

choice = input("The default information passed is:\nNumber=5\nstart_date='2019-01-01'\nend_date='2019-08-01'\nyear=2019\n\nDo you want to change it? (Y/N): ")

if choice.strip().upper() == 'Y':
    s_date = input("Enter new start date in format (YYYY-MM-DD): ")
    e_date = input("Enter new end date in format (YYYY-MM-DD): ")
    new_year = int(input("Enter year: "))
    num = int(input("Enter number of random samples: "))
else:
    s_date = '2019-01-01'
    e_date = '2019-08-01'
    new_year = 2019  # matches the defaults shown above
    num = 5

print('Process started ...')
a = Scrap(urls_dict=pub_dict, number=num, start_date=s_date, end_date=e_date, year=new_year)
a.scrap()
a.dataframe().to_csv('results.csv')
print(a.dataframe())
print('-----------------')
print('Process ended... Thanks for using!')
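To inspect the saved results afterwards, a minimal sketch (assuming the `results.csv` file written above):

```python
import pandas as pd

# Load the CSV written by run.py; the first column is the DataFrame index.
df = pd.read_csv('results.csv', index_col=0)
print(df[['Title', 'Claps', 'Publication']].head())
```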
6 changes: 6 additions & 0 deletions Web-Scraping/ScrappingHackerNewsWebsite/README.md
@@ -0,0 +1,6 @@
# Scraping Hacker News Website

Scrapes the first 2 pages of the Hacker News website, where users can read tech news (as articles), keeping only stories with more than 100 upvotes, with the help of the Requests and Beautiful Soup modules. Users can click a story link to read the full article.

Link for the Hacker News website - https://news.ycombinator.com/
@@ -0,0 +1,54 @@
'''
Scrapes the first 2 pages of the Hacker News website, which list lots of tech news (as articles),
keeping only stories with more than 100 upvotes. The user can click a story link to read the article.
'''

'''
The program uses the requests module to get web data from the URL and the BeautifulSoup module
to parse the web data as HTML using the html parser.
Install the requests and BeautifulSoup modules before executing!
'''

import requests
from bs4 import BeautifulSoup
import pprint  # pretty-prints the final output; an inbuilt Python module


response1 = requests.get("https://news.ycombinator.com/news")  # Storing response of first page of website
response2 = requests.get("https://news.ycombinator.com/news?p=2")  # Storing response of second page of website

response1_html_parser = BeautifulSoup(response1.text, 'html.parser')  # Parsing the received web data with the html parser
response2_html_parser = BeautifulSoup(response2.text, 'html.parser')

linksInPage1 = response1_html_parser.select('.storylink')  # All links of tech news are included in class "storylink"
linksInPage2 = response2_html_parser.select('.storylink')

votesInPage1 = response1_html_parser.select('.subtext')  # All votes are stored inside subclass "score" of class "subtext"
votesInPage2 = response2_html_parser.select('.subtext')


mega_link = linksInPage1 + linksInPage2  # Combining links of both pages
#print(mega_link)
mega_votes = votesInPage1 + votesInPage2


def sorted_stories_list(hackerNewsList):
    """Sort the list in decreasing order with respect to votes."""
    return sorted(hackerNewsList, key=lambda x: x['votes'], reverse=True)


def create_custom_hackernews(mega_link, mega_votes):
    hackerNews = []
    for index, item in enumerate(mega_link):
        title = mega_link[index].getText()  # To get the title of the story (news)
        href = mega_link[index].get('href', None)  # To get the link of the story (news); if no link is present, default is None
        vote = mega_votes[index].select('.score')  # Points are stored inside class "score" of class "subtext"; if points/votes are unavailable, class "score" won't be present
        if len(vote):  # To check whether class "score" exists
            points = int(vote[0].getText().replace(' points', ''))
            if points > 100:  # To keep only stories with more than 100 votes/points
                hackerNews.append({'title': title, 'link': href, 'votes': points})

    return sorted_stories_list(hackerNews)


if __name__ == '__main__':
    # Prints story link, story title and its votes in a pretty manner
    pprint.pprint(create_custom_hackernews(mega_link, mega_votes))