Commit decbf294 authored by Quentin Aristote's avatar Quentin Aristote
Browse files

completed code

parent 7d85b105
......@@ -2,26 +2,35 @@
import bs4 as bs
import urllib.request
import os
# Root of the website; relative links scraped from its pages are appended to it.
HOSTNAME = "https://sonotheque.mnhn.fr"
# NOTE(review): URL_PATTERN looks like the pre-commit name of URL_SEARCH_PATTERN
# (removed side of the diff) -- confirm before relying on it.
URL_PATTERN = HOSTNAME + "/?q=&tbs={filters}"
# Search URL; {filters} receives the comma-separated list of filters.
URL_SEARCH_PATTERN = HOSTNAME + "/?q=&tbs={filters}"
# Templates for the individual entries of the filter list.
FILTER_PATTERN = "qr:{filter}"
RESULTS_PER_PAGE_PATTERN = "ipp:{results_per_page}"
SORT_BY_PATTERN = "sbd:{sort_by}"
SINCE_PATTERN = "qdr:{since}"
# URL of a single file's page; {id} receives the file's ID.
URL_FILE_PATTERN = HOSTNAME + '/sounds/mnhn/so/{id}'
def getURLBase(soundscape = False, species = False,
def getBaseURL(soundscape = False, species = False,
validated = False, not_validated = False,
sort_by = None,
since = None) :
"""Return the base URL corresponding to the filters.
:param soundscape: Whether to get ambient sounds. If ambiant and species are False, get both types of sounds. Defaults to False.
:param species: Whether to get species sounds. If ambiant and species are False, get both types of sounds. Defaults to False.
:param validated: Whether to get validated sounds. If validated and not_validated are False, get both types of sounds. Defaults to False.
:param sort_by: The (decreasing) order in which the files should be downloaded. Can be 'date', 'number' or 'title'. If None (default), they are ordered by date.
:param since: How recent the downloaded files should be. Can be 'day', 'week', 'month' or 'year'. If None (default), all the files are downloaded."""
:param soundscape: Whether to get ambient sounds. If ambiant and species are False,
get both types of sounds. Defaults to False.
:param species: Whether to get species sounds. If ambiant and species are False,
get both types of sounds. Defaults to False.
:param validated: Whether to get validated sounds. If validated and not_validated are False,
get both types of sounds. Defaults to False.
:param sort_by: The (decreasing) order in which the files should be downloaded.
Can be 'date', 'number' or 'title'. If None (default), they are ordered by date.
:param since: How recent the downloaded files should be. Can be 'day', 'week', 'month' or 'year'.
If None (default), all the files are downloaded."""
filters = ['ipp:50']
......@@ -32,7 +41,7 @@ def getURLBase(soundscape = False, species = False,
if validated :
filters.append(FILTER_PATTERN.format(filter = 'v'))
if not_validated :
filters.append(FILTER.PATTERN.format(filter = 'nv'))
filters.append(FILTER_PATTERN.format(filter = 'nv'))
if sort_by :
if sort_by == 'date' :
sort_by = 'n'
......@@ -56,48 +65,122 @@ def getURLBase(soundscape = False, species = False,
raise NotImplementedError("since cannot be set to " + since)
filters.append(SINCE_PATTERN.format(since = since))
url = URL_PATTERN.format(filters = ','.join(filters))
url = URL_SEARCH_PATTERN.format(filters = ','.join(filters))
return url
def getHTML(url) :
    """Return the parsed HTML code from a URL.

    :param url: the URL to scrape.
    :return: the BeautifulSoup tree built from the page with the 'html.parser' backend."""
    # Close the HTTP response once it has been parsed instead of leaking the connection.
    with urllib.request.urlopen(url) as page :
        html = bs.BeautifulSoup(page, features = 'html.parser')
    return html
def getNextHTML(html_current) :
    """Generate the HTML code of the current page as well as the following ones.

    :param html_current: the parsed HTML code of the first page; it is yielded first,
        then scraped for the link to the next page.
    :return: a generator over the parsed pages, in the order the 'next' links give them."""
    html = html_current
    while True :
        yield html
        td = html.find('td', attrs = {'class' : 'next'})
        # No 'next' cell: the current page is the last one.
        if td is None :
            return
        link = td.find('a')
        # A 'next' cell without a usable link cannot be followed; stop here
        # rather than fail on subscripting None.
        if link is None or not link.get('href') :
            return
        html = getHTML(HOSTNAME + link['href'])
def getFilesIDs(html) :
    """Generate the IDs for each file listed on the current page.

    :param html: the parsed HTML code of a results page.
    :return: a generator over the IDs, i.e. the text of each row's title
        with its first 8 characters removed."""
    table_rows = html.find_all('li', attrs = {'class' : 'table-row'})
    for table_row in table_rows :
        title = table_row.find('span', attrs = {'class' : 'title'})
        # A row without a title span carries no ID; skip it instead of
        # failing on None.text.
        if title is None :
            continue
        # NOTE(review): the 8 dropped characters are presumably a fixed
        # collection prefix in front of the ID -- confirm against the site.
        file_id = title.text[8:]
        yield file_id
def getFileHTML(id) :
    """Return the parsed HTML code of a file's page given its ID.

    :param id: the ID of the file; it is substituted into URL_FILE_PATTERN.
    :return: the parsed HTML code of the file's page, as returned by getHTML."""
    url = URL_FILE_PATTERN.format(id = id)
    return getHTML(url)
def downloadFile(html, path = '.') :
    """Download a file from its page. Return the filename.

    :param html: the parsed HTML code of the file's page; the link with class
        'button raised' gives the location of the file.
    :param path: where to save the file. Without any '/', it names a directory;
        otherwise everything before the last '/' is the directory and what follows
        is the file name. When no file name is given, urllib.request.urlretrieve
        chooses the location of the file itself.
    :return: the file name reported by urllib.request.urlretrieve."""
    directory, separator, name = path.rpartition('/')
    if separator :
        filename = name or None
        # A path such as '/file.mp3' has an empty head: the directory is the root.
        directory = directory or '/'
    else :
        directory, filename = path, None
    # Resolve the link before touching the working directory, so that a page
    # without a download link leaves the process state untouched.
    url = HOSTNAME + html.find('a', attrs = {'class' : 'button raised'})['href']
    cwd = os.getcwd()
    os.chdir(directory)
    try :
        filename, _ = urllib.request.urlretrieve(url, filename = filename)
    finally :
        # Always restore the working directory, even when the download fails.
        os.chdir(cwd)
    return filename
def getFileMetadata(html) :
    """Return a dictionary containing information about the recording on the current page.

    Not implemented yet.

    :param html: the HTML code of the recording's page.
    :raises NotImplementedError: always, until the scraping is written."""
    raise NotImplementedError('todo')
def getRecordings(directory = '.',
                  soundscape = False, species = False,
                  validated = False, not_validated = False,
                  sort_by = None,
                  since = None) :
    """Download all the recordings corresponding to specific filters.

    Each recording is saved as '<ID>.mp3' and announced on standard output.

    :param directory: the path to save the recordings to.
    :param soundscape: Whether to get ambient sounds. If soundscape and species are False,
        get both types of sounds. Defaults to False.
    :param species: Whether to get species sounds. If soundscape and species are False,
        get both types of sounds. Defaults to False.
    :param validated: Whether to get validated sounds. If validated and not_validated are False,
        get both types of sounds. Defaults to False.
    :param not_validated: Whether to get sounds that are not validated. If validated and
        not_validated are False, get both types of sounds. Defaults to False.
    :param sort_by: The (decreasing) order in which the files should be downloaded.
        Can be 'date', 'number' or 'title'. If None (default), they are ordered by date.
    :param since: How recent the downloaded files should be. Can be 'day', 'week', 'month' or 'year'.
        If None (default), all the files are downloaded.
    :return: None."""
    base_url = getBaseURL(soundscape = soundscape, species = species,
                          validated = validated, not_validated = not_validated,
                          sort_by = sort_by,
                          since = since)
    base_html = getHTML(base_url)
    for html in getNextHTML(base_html) :
        for file_id in getFilesIDs(html) :
            file_html = getFileHTML(file_id)
            filename = file_id + '.mp3'
            # Announce the file before fetching it, so the message describes
            # the download in progress rather than one that already finished.
            print('Downloading {filename} ...'.format(filename = filename))
            # downloadFile splits its path on '/', so the separator is spelled
            # out here rather than taken from the operating system.
            downloadFile(file_html, path = directory + '/' + filename)
    return None
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment