Getting files from master of ankit to master of kaustubh #2

Merged
merged 13 commits into from Aug 17, 2020
Empty file removed Basic-Scripts/tic-tac-toe.py
Empty file.
5 changes: 3 additions & 2 deletions Contribution.md
@@ -13,7 +13,7 @@
- Fork the [repository](https://github.com/ankitdobhal/Awesome-Python-Scripts)

- Clone the fork [repo](https://github.com/ankitdobhal/Awesome-Python-Scripts)
- git clone https://github.com/ankitdobhal/Awesome-Python-Scripts
- git clone https://github.com/<Your_Username>/Awesome-Python-Scripts
- Create new branch
- git checkout -b <Your-Branch-Name>

@@ -22,8 +22,9 @@

- Add a commit message!
- git commit -a -m "<Added your message>"

- Push changes
- git push origin
- git push -u origin <name_of_your_branch>

- Create pull requests
- [Try to Mention the related issue for your PR]
52 changes: 52 additions & 0 deletions System-Automation-Scripts/junk_organiser.py
@@ -0,0 +1,52 @@
import os
from pathlib import Path as pt

DIRECTORIES = {
    "HTML": [".html5", ".html", ".htm", ".xhtml"],
    "IMAGES": [".jpeg", ".jpg", ".tiff", ".gif", ".bmp", ".png", ".bpg", ".svg",
               ".heif", ".psd"],
    "VIDEOS": [".avi", ".flv", ".wmv", ".mov", ".mp4", ".webm", ".vob", ".mng",
               ".qt", ".mpg", ".mpeg", ".3gp"],
    "DOCUMENTS": [".oxps", ".epub", ".pages", ".docx", ".doc", ".fdf", ".ods",
                  ".odt", ".pwi", ".xsn", ".xps", ".dotx", ".docm", ".dox",
                  ".rvg", ".rtf", ".rtfd", ".wpd", ".xls", ".xlsx", ".ppt",
                  ".pptx"],
    "ARCHIVES": [".a", ".ar", ".cpio", ".iso", ".tar", ".gz", ".rz", ".7z",
                 ".dmg", ".rar", ".xar", ".zip"],
    "AUDIO": [".aac", ".aa", ".dvf", ".m4a", ".m4b", ".m4p", ".mp3",
              ".msv", ".ogg", ".oga", ".raw", ".vox", ".wav", ".wma"],
    "PLAINTEXT": [".txt", ".in", ".out"],
    "PDF": [".pdf"],
    "PYTHON": [".py"],
    "C": [".c"],
    "CPP": [".cpp"],
    "JAVA": [".java"],
    "XML": [".xml"],
    "EXE": [".exe"],
    "SHELL": [".sh"]
}

# Invert DIRECTORIES so each extension maps directly to its target directory name.
FILE_FORMATS = {file_format: directory
                for directory, file_formats in DIRECTORIES.items()
                for file_format in file_formats}


def org_junk():
    # Move every file in the current directory into the folder named after its extension group.
    for entry in os.scandir():
        if entry.is_dir():
            continue
        file_path = pt(entry)
        file_format = file_path.suffix.lower()
        if file_format in FILE_FORMATS:
            directory_path = pt(FILE_FORMATS[file_format])
            directory_path.mkdir(exist_ok=True)
            file_path.rename(directory_path.joinpath(file_path.name))

    # Remove any directories that are left empty.
    for directory in os.scandir():
        try:
            os.rmdir(directory)
        except OSError:
            pass


if __name__ == "__main__":
    org_junk()
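As a quick illustration of the `FILE_FORMATS` inversion used above, here is a minimal sketch with a trimmed-down `DIRECTORIES` dict (not part of the script):

```python
DIRECTORIES = {"PDF": [".pdf"], "IMAGES": [".jpg", ".png"]}

# Same dict-comprehension inversion as in junk_organiser.py.
FILE_FORMATS = {file_format: directory
                for directory, file_formats in DIRECTORIES.items()
                for file_format in file_formats}

print(FILE_FORMATS)  # {'.pdf': 'PDF', '.jpg': 'IMAGES', '.png': 'IMAGES'}
```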
4 changes: 4 additions & 0 deletions Web-Scraping/Medium-Articles-Details-Scrapping/.gitignore
@@ -0,0 +1,4 @@
# Unnecessary files

app/__pycache__/
.idea
42 changes: 42 additions & 0 deletions Web-Scraping/Medium-Articles-Details-Scrapping/README.md
@@ -0,0 +1,42 @@
# Medium-Articles-Details-Scrapping
This script scrapes details about Medium articles published within a date range in the given publication. The dates are chosen randomly; if there is no article on a chosen date, that date is skipped. The result is a DataFrame that can be saved in any format (currently saved as CSV). Here is a preview of the terminal:
![](terminal-preview.PNG)

# Requirements
- numpy
- pandas
- bs4
- requests

# How to run?
- Run the command: python run.py

# About the Scrap class
A scraper that collects details about Medium articles published in a date range in a publication by selecting random dates.

Attributes
----------
- `urls_dict` (dict): key-value pairs of publication names and links, e.g. `urls_dict={"The Startup": "https://medium.com/swlh"}`
- `start_date` (str): starting date of the search. Default: `2020-01-01`
- `end_date` (str): ending date of the search. Default: `2020-08-01`
- `year` (int): year in which the search is done. Default: `2020`
- `number` (int): number of random dates to pick. Default: `10`

Methods
-------
- `scrap()`: starts the scraping process.
- `dataframe()`: returns the DataFrame object.
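
A minimal usage sketch based on the attributes and methods above (the values are illustrative; see `run.py` for the interactive driver):

```python
from app import Scrap

scraper = Scrap(urls_dict={"The Startup": "https://medium.com/swlh"},
                start_date="2020-01-01", end_date="2020-08-01",
                year=2020, number=10)
scraper.scrap()           # visit the randomly chosen archive pages
df = scraper.dataframe()  # collect the results as a pandas DataFrame
df.to_csv("results.csv")
```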

79 changes: 79 additions & 0 deletions Web-Scraping/Medium-Articles-Details-Scrapping/app/__init__.py
@@ -0,0 +1,79 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import numpy as np
import itertools
import time


class Scrap:

    def __init__(self, urls_dict, start_date='2020-01-01', end_date='2020-08-01', number=10, year=2020):
        self.urls = urls_dict
        self.start = pd.to_datetime(start_date)
        self.end = pd.to_datetime(end_date)
        self.n = number
        self.year = year
        self.titles = []
        self.sub_titles = []
        self.article_link = []
        self.claps = []
        self.reading_time = []
        self.responses = []
        self.pubs = []
        self.dates_list = []

    def randDates(self):
        # Draw self.n random Unix timestamps between start and end, then convert them back to dates.
        start_u = self.start.value // 10**9
        end_u = self.end.value // 10**9
        return pd.DatetimeIndex((10**9 * np.random.randint(start_u, end_u, self.n, dtype=np.int64)).view('M8[ns]')).date

    def scrap(self):
        dates = pd.to_datetime(pd.Series(self.randDates()))
        for i in range(len(dates)):
            month = dates.dt.month[i]
            day = dates.dt.day[i]
            for publication, url in self.urls.items():
                # Medium archive pages follow the pattern <publication>/archive/<year>/<month>/<day>.
                url = url + '/archive/{0}/{1:02d}/{2:02d}'
                print(f'Publication: {publication}, Date: {self.year}-{month}-{day}')
                response = requests.get(url.format(self.year, month, day), allow_redirects=True)
                # Medium redirects away from the archive URL when there are no articles for that date; skip those.
                if not response.url.startswith(url.format(self.year, month, day)):
                    continue
                page = response.content
                soup = BeautifulSoup(page, 'html.parser')
                articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")

                number = len(articles)  # one row per article on this archive page

                self.titles.append([i.find('h3', class_="graf--title").text if i.find('h3', class_="graf--title") is not None else '' for i in articles])

                self.sub_titles.append([i.find("h4", class_="graf--subtitle").text if i.find("h4", class_="graf--subtitle") is not None else '' for i in articles])

                self.article_link.append([i.find_all('a')[3]['href'].split('?')[0] for i in articles])

                # Clap counts are rendered either as plain numbers or with a 'K' suffix (e.g. '1.2K').
                self.claps.append([0 if (k is None) or (k == '') or (k.split is None) else int(float(k.split('K')[0]) * 1000) if len(k.split('K')) == 2 else int(float(k.split('K')[0])) for k in [j.text for j in [i.find_all('button')[1] for i in articles]]])

                self.reading_time.append([int(i.find("span", class_="readingTime")['title'].split()[0]) if i.find("span", class_="readingTime") is not None else 0 for i in articles])

                self.responses.append([i.find_all('a')[6].text.split(' ')[0] if (len(i.find_all('a')) == 7) and len(i.find_all('a')[6].text.split(' ')) != 0 else 0 for i in articles])

                self.pubs.append([publication] * number)

                self.dates_list.append([f'{self.year}-{month}-{day}'] * number)  # repeat the date so all columns stay aligned

                time.sleep(0.3)

    def dataframe(self):
        columns = ['Title', 'SubTitle', 'Link', 'Claps', 'Reading_Time', 'Responses', 'Publication', 'Date_Published']
        # Flatten the per-page lists into single columns.
        titles = list(itertools.chain.from_iterable(self.titles))
        sub_titles = list(itertools.chain.from_iterable(self.sub_titles))
        article_link = list(itertools.chain.from_iterable(self.article_link))
        claps = list(itertools.chain.from_iterable(self.claps))
        reading_time = list(itertools.chain.from_iterable(self.reading_time))
        responses = list(itertools.chain.from_iterable(self.responses))
        pubs = list(itertools.chain.from_iterable(self.pubs))
        dates = list(itertools.chain.from_iterable(self.dates_list))

        return pd.DataFrame(zip(titles, sub_titles, article_link, claps, reading_time, responses, pubs, dates), columns=columns)
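For reference, a minimal sketch of the clap-count conversion performed by the dense comprehension in `scrap()` above; the helper name `parse_claps` is ours, not part of the script:

```python
def parse_claps(text):
    """Convert a Medium clap string such as '847' or '1.2K' to an int (0 for empty)."""
    if not text:
        return 0
    if 'K' in text:
        return int(float(text.split('K')[0]) * 1000)
    return int(float(text))


assert parse_claps('') == 0
assert parse_claps('847') == 847
assert parse_claps('1.2K') == 1200
```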
31 changes: 31 additions & 0 deletions Web-Scraping/Medium-Articles-Details-Scrapping/run.py
@@ -0,0 +1,31 @@
from app import Scrap

print('-----------------')
pub_name = [i.strip() for i in input('Enter the comma separated list of publication names (The Startup, Medium ...): ').split(',')]
pub_link = [i.strip() for i in input('Enter the comma separated links of publications (https://medium.com/swlh, https://towardsdatascience.com ...): ').split(',')]

if len(pub_name) != len(pub_link):
    raise SystemExit('Please enter links for all publications!')

pub_dict = {i: j for i, j in zip(pub_name, pub_link)}

choice = input("The default information passed is:\nNumber=5\nstart_date='2019-01-01'\nend_date='2019-08-01'\nyear=2019\n\nDo you want to change it? (Y/N): ")

if choice.strip().upper() == 'Y':
    s_date = input("Enter new start date in format (YYYY-MM-DD): ")
    e_date = input("Enter new end date in format (YYYY-MM-DD): ")
    new_year = int(input("Enter year: "))
    num = int(input("Enter number of random samples: "))
else:
    s_date = '2019-01-01'
    e_date = '2019-08-01'
    new_year = 2019  # matches the defaults shown above
    num = 5

print('Process started ...')
a = Scrap(urls_dict=pub_dict, number=num, start_date=s_date, end_date=e_date, year=new_year)
a.scrap()
a.dataframe().to_csv('results.csv')
print(a.dataframe())
print('-----------------')
print('Process ended... Thanks for using!')
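To inspect the saved results afterwards, a minimal sketch (assuming the `results.csv` file written above):

```python
import pandas as pd

# Load the CSV written by run.py; the first column is the DataFrame index.
df = pd.read_csv('results.csv', index_col=0)
print(df[['Title', 'Claps', 'Publication']].head())
```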
6 changes: 6 additions & 0 deletions Web-Scraping/ScrappingHackerNewsWebsite/README.md
@@ -0,0 +1,6 @@
# Scraping Hacker News Website

Scrapes the first 2 pages of the Hacker News website, where users can read tech news (as articles), keeping only stories with more than 100 upvotes, with the help of the Requests and Beautiful Soup modules. Users can click a story link to read the full article.

Link for the Hacker News website - https://news.ycombinator.com/
@@ -0,0 +1,54 @@
'''
Scrapes the first 2 pages of the Hacker News website, which list lots of tech news (as articles),
keeping only stories with more than 100 upvotes. The user can click a story link to read the article.
'''

'''
The program uses the requests module to get web data from the URL and the BeautifulSoup module
to parse the web data as HTML using the html parser.
Install the requests and BeautifulSoup modules before executing!
'''

import requests
from bs4 import BeautifulSoup
import pprint  # pretty-prints the final output; an inbuilt Python module


response1 = requests.get("https://news.ycombinator.com/news")  # Storing response of first page of website
response2 = requests.get("https://news.ycombinator.com/news?p=2")  # Storing response of second page of website

response1_html_parser = BeautifulSoup(response1.text, 'html.parser')  # Parsing the received web data with the html parser
response2_html_parser = BeautifulSoup(response2.text, 'html.parser')

linksInPage1 = response1_html_parser.select('.storylink')  # All links of tech news are included in class "storylink"
linksInPage2 = response2_html_parser.select('.storylink')

votesInPage1 = response1_html_parser.select('.subtext')  # All votes are stored inside subclass "score" of class "subtext"
votesInPage2 = response2_html_parser.select('.subtext')


mega_link = linksInPage1 + linksInPage2  # Combining links of both pages
#print(mega_link)
mega_votes = votesInPage1 + votesInPage2


def sorted_stories_list(hackerNewsList):
    """Sort the list in decreasing order with respect to votes."""
    return sorted(hackerNewsList, key=lambda x: x['votes'], reverse=True)


def create_custom_hackernews(mega_link, mega_votes):
    hackerNews = []
    for index, item in enumerate(mega_link):
        title = mega_link[index].getText()  # To get the title of the story (news)
        href = mega_link[index].get('href', None)  # To get the link of the story (news); if no link is present, default is None
        vote = mega_votes[index].select('.score')  # Points are stored inside class "score" of class "subtext"; if points/votes are unavailable, class "score" won't be present
        if len(vote):  # To check whether class "score" exists
            points = int(vote[0].getText().replace(' points', ''))
            if points > 100:  # To keep only stories with more than 100 votes/points
                hackerNews.append({'title': title, 'link': href, 'votes': points})

    return sorted_stories_list(hackerNews)


if __name__ == '__main__':
    # Prints story link, story title and its votes in a pretty manner
    pprint.pprint(create_custom_hackernews(mega_link, mega_votes))