diff --git a/Web-Scraping/Medium-article-downloader/Article_downloader.py b/Web-Scraping/Medium-article-downloader/Article_downloader.py
index 3e8eaf3..f5178ab 100644
--- a/Web-Scraping/Medium-article-downloader/Article_downloader.py
+++ b/Web-Scraping/Medium-article-downloader/Article_downloader.py
@@ -1,40 +1,71 @@
-#!/usr/bin/env python3
-#Imports and dependencies
-
-import requests
 from bs4 import BeautifulSoup
+import requests
+import re
+import os
+import shutil
 
-def download_article():
-
-    #The URL of the article is entered here
-    page_url = input("Enter the URL of the Medium Article ")
+ARTICLE_DIR = 'article'
 
-    #On looking for "my user agent", can be used to retrieve the value"
-    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'}
+def get_article_content(url: str) -> tuple:
+    """
+    Scrapes and saves all content in the article using Beautiful Soup
+    and a handy regex on element ids.
+    Returns the bs4 object and the filename.
+    """
+    response = requests.get(url, timeout=10)
 
-    response = requests.get(page_url)
+    soup = BeautifulSoup(response.content, "html.parser")
+    filename = soup.find('h1').text.replace(' ', '_')
 
-    soup = BeautifulSoup(response.text,"html.parser")
+    relevant = soup.find_all(re.compile(r'p|li|h[0-4]+|span'), id=re.compile(r'[a-z0-9]{4}'))
 
-    filename = soup.find('h1').text.replace(' ', '_')
+
+    if not os.path.exists(ARTICLE_DIR):
+        os.mkdir(ARTICLE_DIR)
 
-    #The content is written into a text file
+    content = ''
+    for i in relevant:
+        content += i.text + '\n'
 
-    file = open(filename, "w")
+    with open(f'{ARTICLE_DIR}/{filename}.txt', 'w') as f:
+        f.write(content.strip())
+    return soup, filename
 
-    #The content of the article is stored in the <article> tag
+def save_images(soup: BeautifulSoup) -> None:
+    """
+    Saves all images (highest quality) in the article body.
+    """
+    i = 1
+    image_dir = f'{ARTICLE_DIR}/images'
+    if not os.path.exists(image_dir):
+        os.mkdir(image_dir)
 
-    for line in soup.find('article').find('div'):
-
-        #All the content is essentially stored between <pre> </pre> tags
-
-        for content in line.find_all('p'):
+    for img in soup.find_all('source'):
+        try:
+            link = img['srcset'].split(',')[-1].strip().split(' ')[0]  # URL of the widest candidate
+            img_data = requests.get(link, timeout=10).content
+            with open(f'{image_dir}/image_{i}.png', 'wb') as f:  # .png name regardless of source format
+                f.write(img_data)
+            i += 1
+        except (KeyError, requests.exceptions.RequestException):
+            continue  # no srcset on this <source>, or the download failed
 
-            #contents are written into a file
-
-            file.write(content.text + '\n')
+def compress_and_cleanup_files(directory: str, filename: str) -> None:
+    """
+    Zips the article content and deletes the directory.
+    """
+    shutil.make_archive(filename, 'zip', directory)
+    shutil.rmtree(directory)
 
-    file.close()
+def main(url):
+    """
+    Runs all the functions in order.
+    """
+    soup, filename = get_article_content(url)
+    save_images(soup)
+    compress_and_cleanup_files(ARTICLE_DIR, filename)
 
 if __name__ == "__main__":
-    download_article()
+    article_url = input("Enter the URL of the Medium Article: ")
+    # article_url = "https://medium.com/pytorch/accelerate-pytorch-with-ipex-and-onednn-using-intel-bf16-technology-dca5b8e6b58f"
+    main(article_url)
\ No newline at end of file
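
The find_all call in get_article_content deserves a note: BeautifulSoup applies a compiled regex passed as the tag-name filter with re.search, so r'p|li|h[0-4]+|span' also matches names that merely contain 'p' (such as pre), and the id filter keeps only elements whose id contains four consecutive lowercase alphanumerics, the pattern Medium appears to use for its content nodes. A minimal sketch of that behaviour on invented markup (the HTML below is illustrative, not real Medium output):

    import re
    from bs4 import BeautifulSoup

    html = '<p id="ab12">kept</p><p id="x">dropped</p><pre id="cd34">also kept</pre>'
    soup = BeautifulSoup(html, "html.parser")

    # The name regex is matched with re.search, so 'p' also hits 'pre';
    # the id regex drops elements whose id has fewer than four characters.
    tags = soup.find_all(re.compile(r'p|li|h[0-4]+|span'), id=re.compile(r'[a-z0-9]{4}'))
    print([t.name for t in tags])  # ['p', 'pre']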
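
save_images picks the largest rendition of each image by taking the last entry of the srcset attribute, relying on Medium listing candidates in ascending width order. A small worked example of the parsing expression on an invented srcset value:

    srcset = ("https://example.com/img/a.jpeg 320w, "
              "https://example.com/img/b.jpeg 640w, "
              "https://example.com/img/c.jpeg 1400w")

    # The last comma-separated candidate is the widest; strip the leading
    # space before splitting off the "1400w" width descriptor.
    largest = srcset.split(',')[-1].strip().split(' ')[0]
    print(largest)  # https://example.com/img/c.jpeg

The .strip() guards against srcset values with no space after each comma, where the original split(' ')[1] would have returned the width descriptor instead of the URL.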
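
One behavioural detail of compress_and_cleanup_files: shutil.make_archive appends the .zip extension itself, so an article titled My_Article is packed into My_Article.zip next to the script, after which shutil.rmtree removes the article/ working directory.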