Commit f0d9302c authored by Quentin Aristote
Browse files

renamed id to title and file to sound

parent c6e5bc88
# -*- coding: utf-8 -*-
import bs4 as bs
import urllib.request
import os
import os.path
# Root of the sound library website; every other URL is built on top of it.
HOSTNAME = "https://sonotheque.mnhn.fr"

# Templates for the individual entries of the comma-separated ``tbs`` search
# parameter (one template per kind of entry).
FILTER_PATTERN = "qr:{filter}"
RESULTS_PER_PAGE_PATTERN = "ipp:{results_per_page}"
SORT_BY_PATTERN = "sbd:{sort_by}"
SINCE_PATTERN = "qdr:{since}"

# The doubled braces survive this first ``format`` call as literal
# ``{filters}`` / ``{title}`` placeholders, to be filled in by the callers.
URL_SEARCH_PATTERN = "{host}/?q=&tbs={{filters}}".format(host=HOSTNAME)
URL_SOUND_PATTERN = "{host}/sounds/mnhn/so/{{title}}".format(host=HOSTNAME)
def getBaseURL(soundscape=False, species=False,
               validated=False, not_validated=False,
               sort_by=None,
               since=None):
    """Return the search URL corresponding to the given filters.

    :param soundscape: whether to get ambient sounds. If soundscape and species
        are both False, get both types of sounds. Defaults to False.
    :param species: whether to get species sounds. If soundscape and species
        are both False, get both types of sounds. Defaults to False.
    :param validated: whether to get validated sounds. If validated and
        not_validated are both False, get both types of sounds. Defaults to False.
    :param not_validated: whether to get sounds that are not validated. If
        validated and not_validated are both False, get both types of sounds.
        Defaults to False.
    :param sort_by: the (decreasing) order in which the sounds should be listed.
        Can be 'date', 'number' or 'title'. If None (default), no sort entry is
        added to the URL (the original documentation states the site then
        orders by date).
    :param since: how recent the listed sounds should be. Can be 'day', 'week',
        'month' or 'year'. If None (default), no age entry is added to the URL.
    :return: the search URL, as a string.
    :raises NotImplementedError: if sort_by or since is set to an unsupported value.
    """
    # NOTE(review): 'date' and 'number' both map to 'n', exactly as in the
    # previous implementation. This looks like a copy/paste slip, but the code
    # the site expects for 'date' is not visible here -- confirm before changing.
    sort_codes = {'date': 'n', 'number': 'n', 'title': 't'}
    since_codes = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

    # Always request 50 results per page, through the dedicated pattern.
    filters = [RESULTS_PER_PAGE_PATTERN.format(results_per_page=50)]

    flags = (
        (soundscape, 'a'),
        (species, 'e'),
        (validated, 'v'),
        (not_validated, 'nv'),
    )
    filters.extend(
        FILTER_PATTERN.format(filter=code) for enabled, code in flags if enabled
    )

    if sort_by:
        # Only strings are looked up, so unhashable or otherwise odd values
        # fall through to the error below instead of raising a TypeError.
        code = sort_codes.get(sort_by) if isinstance(sort_by, str) else None
        if code is None:
            raise NotImplementedError("sort_by cannot be set to " + str(sort_by))
        filters.append(SORT_BY_PATTERN.format(sort_by=code))

    if since:
        code = since_codes.get(since) if isinstance(since, str) else None
        if code is None:
            raise NotImplementedError("since cannot be set to " + str(since))
        filters.append(SINCE_PATTERN.format(since=code))

    return URL_SEARCH_PATTERN.format(filters=','.join(filters))
def getHTML(url):
    """Fetch a URL and return its parsed HTML.

    :param url: the URL to scrape.
    :return: the page parsed by BeautifulSoup with the 'html.parser' backend.
    """
    # The context manager closes the HTTP response as soon as the document has
    # been parsed, instead of leaving the connection open until it happens to
    # be garbage-collected.
    with urllib.request.urlopen(url) as page:
        return bs.BeautifulSoup(page, features='html.parser')
def getNextHTML(html_current):
    """Generate the HTML code of the current page as well as the following ones.

    The given page is yielded first; the generator then follows the link found
    in the ``td`` cell of class ``next`` until a page has no such cell or the
    cell holds no link.

    :param html_current: the initial HTML code to scrape for the next page.
    """
    html = html_current
    yield html
    while True:
        cell = html.find('td', attrs={'class': 'next'})
        # Every page is checked the same way: a missing cell or a cell without
        # a link both mean there is no further page. Previously only the first
        # page was guarded and a later page without the cell crashed.
        link = cell.find('a') if cell is not None else None
        if link is None:
            return
        html = getHTML(HOSTNAME + link['href'])
        yield html
def getSoundsTitles(html):
    """Generate the title of every sound listed on the current page.

    Each ``li`` element of class ``table-row`` is expected to hold a ``span``
    of class ``title``; the first 8 characters of that span's text are dropped
    and the remainder is yielded as the title.

    :param html: the HTML code to scrape.
    """
    prefix_length = 8  # number of leading characters removed from the span text
    rows = html.findAll('li', attrs={'class': 'table-row'})
    yield from (
        row.find('span', attrs={'class': 'title'}).text[prefix_length:]
        for row in rows
    )
def getSoundHTML(title):
    """Fetch and return the parsed HTML of a sound's page given its title.

    :param title: the title of the sound, inserted into the sound page URL.
    """
    return getHTML(URL_SOUND_PATTERN.format(title=title))
def downloadSound(html, path='.'):
    """Download a sound from its page and return the name of the saved file.

    :param html: the HTML code of the sound's page.
    :param path: where to save the sound. If it is an existing directory or
        ends with a path separator, the file name is taken from the last
        component of the download URL.
    :return: the name of the saved file, without its directory. If no name can
        be derived at all, the path of the temporary file chosen by
        ``urlretrieve`` is returned instead.
    :raises FileNotFoundError: if the page holds no download link.
    """
    from urllib.parse import urlsplit  # local import: keeps file-level imports untouched

    try:
        href = html.find('a', attrs={'class': 'button raised'})['href']
    except (AttributeError, KeyError, TypeError) as error:
        # No link element (None is not subscriptable) or no href attribute.
        raise FileNotFoundError('the sound does not seem available for download.') from error
    url = HOSTNAME + href

    if os.path.isdir(path):
        # Covers the default '.', which os.path.split would otherwise turn
        # into an empty directory and the file name '.'.
        directory, filename = path, ''
    else:
        directory, filename = os.path.split(path)
    if not filename:
        filename = os.path.basename(urlsplit(url).path)
    if not filename:
        # Nothing to name the file after: let urlretrieve pick a temporary file.
        saved, _ = urllib.request.urlretrieve(url)
        return saved

    # Saving straight to the joined path avoids changing the process-wide
    # working directory, which was left changed whenever the download failed.
    urllib.request.urlretrieve(url, filename=os.path.join(directory, filename))
    return filename
def getSoundMetadata(html):
    """Return a dictionary containing information about the sound on the current page.

    Not implemented yet: calling this function always raises.

    :param html: the HTML code of the recording's page.
    :raises NotImplementedError: always.
    """
    message = 'todo'
    raise NotImplementedError(message)
def getRecordings(directory='.',
                  soundscape=False, species=False,
                  validated=False, not_validated=False,
                  sort_by=None,
                  since=None,
                  overwrite=False):
    """Download all the recordings corresponding to specific filters.

    Every search result page is visited in turn and each listed sound is saved
    as ``<title>.mp3`` in ``directory``. Progress and failures are printed; a
    failure on one sound does not stop the others.

    :param directory: the path to save the recordings to. It is created if it
        does not exist yet.
    :param soundscape: whether to get ambient sounds. If soundscape and species
        are both False, get both types of sounds. Defaults to False.
    :param species: whether to get species sounds. If soundscape and species
        are both False, get both types of sounds. Defaults to False.
    :param validated: whether to get validated sounds. If validated and
        not_validated are both False, get both types of sounds. Defaults to False.
    :param not_validated: whether to get sounds that are not validated.
        Defaults to False.
    :param sort_by: the order in which the sounds should be downloaded.
        Can be 'date', 'number' or 'title', or None (default).
    :param since: how recent the downloaded sounds should be. Can be 'day',
        'week', 'month' or 'year', or None (default) for no age limit.
    :param overwrite: whether to download again the sounds whose file already exists.
    :return: None.
    """
    if directory:
        # Without the target directory every single download would fail.
        os.makedirs(directory, exist_ok=True)

    base_url = getBaseURL(soundscape=soundscape, species=species,
                          validated=validated, not_validated=not_validated,
                          sort_by=sort_by,
                          since=since)
    for html in getNextHTML(getHTML(base_url)):
        for title in getSoundsTitles(html):
            filename = title + '.mp3'
            path = os.path.join(directory, filename)
            print('Downloading {filename} ...'.format(filename=filename))
            if os.path.isfile(path) and not overwrite:
                print('Sound already downloaded.')
                continue
            try:
                # The page fetch sits inside the try as well, so a sound whose
                # page cannot be loaded is reported and skipped like any other
                # failed download. (The previous code called an undefined
                # getFileSound here.)
                sound_html = getSoundHTML(title)
                downloadSound(sound_html, path=path)
            except Exception as e:
                # Deliberately broad: this is the per-sound best-effort boundary.
                print('Failure : {exception}'.format(exception=e))
            else:
                print('Success.')
    return None
# Script entry point: fetch the validated soundscape recordings into ./sounds.
if __name__ == "__main__":
    getRecordings(
        directory="sounds",
        soundscape=True,
        validated=True,
    )
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment