import os
import re
from itertools import chain

from bs4 import BeautifulSoup
from pytube import YouTube
from pytube.exceptions import PytubeError
def extract_youtube_links(filename):
    """Return the YouTube links found in the HTML file *filename*.

    Two kinds of links are collected:

    * the ``href`` of every ``<a>`` tag whose target contains ``youtube``,
      returned unchanged;
    * the ``src`` of every ``<iframe>`` that points at a YouTube embed
      URL, converted to the equivalent ``watch?v=<id>`` URL.

    Args:
        filename: Path of the HTML file to parse.

    Returns:
        A list of URL strings: the direct links first, followed by the
        converted embedded links, each group in document order.
    """
    # Read the page as bytes so BeautifulSoup performs the encoding
    # detection itself, and close the file as soon as it is parsed.
    with open(filename, 'rb') as page:
        soup = BeautifulSoup(page, 'html.parser')

    # Find and extract all direct youtube links
    youtubelinks = [
        a['href']
        for a in soup.find_all('a', href=True)
        if 'youtube' in a['href']
    ]

    # The video id is everything after '/embed/' up to the next URL
    # delimiter. The query string is optional, and http, https and
    # protocol-relative sources as well as the youtube-nocookie.com
    # embed domain are accepted.
    embed_pattern = re.compile(
        r'(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?&#/]+)'
    )

    # Convert embedded links to 'watch' type links
    embeddedlinks = []
    for iframe in soup.find_all('iframe', src=True):
        id_search = embed_pattern.search(iframe['src'])
        if id_search:
            embeddedlinks.append(
                'https://www.youtube.com/watch?v={}'.format(id_search.group(1))
            )

    # Concatenate embedded and normal youtube links into a single list
    return youtubelinks + embeddedlinks
# Extract youtube links from all the files in the folder 'pages'
pagedir = 'pages'
# Sorted so that the processing order does not depend on the file system.
files = sorted(
    f for f in os.listdir(pagedir) if os.path.isfile(os.path.join(pagedir, f))
)
# dict.fromkeys drops links that occur more than once (for example on
# several pages) while keeping first-seen order, so nothing is fetched twice.
youtubelinks = list(dict.fromkeys(chain.from_iterable(
    extract_youtube_links(os.path.join(pagedir, file)) for file in files
)))

# Download the best possible quality audio from youtube for all the collected links
for i, link in enumerate(youtubelinks, start=1):
    try:
        yt = YouTube(link)
        print('Downloading [{}/{}]: {} {}'.format(i, len(youtubelinks), link, yt.title))
        # order_by sorts ascending, so reverse it to get the highest bitrate first.
        stream = yt.streams.filter(only_audio=True).order_by('abr').desc().first()
        if stream is None:
            print('No audio stream available for {}'.format(link))
            continue
        stream.download()
    except (PytubeError, OSError) as error:
        # A link that is not a video (channel, playlist, ...), an unavailable
        # video or a network failure must not abort the remaining downloads.
        print('Skipping {}: {}'.format(link, error))