Commit cf46fd26 authored by Quentin Aristote
Browse files

started scraping code

parent 1fa97519
# -*- coding: utf-8 -*-
import bs4 as bs
import urllib.request
HOSTNAME = "https://sonotheque.mnhn.fr"
# Search URL: all search options are joined with commas into the `tbs`
# query parameter.
URL_PATTERN = HOSTNAME + "/?q=&tbs={filters}"
FILTER_PATTERN = "qr:{filter}"
RESULTS_PER_PAGE_PATTERN = "ipp:{results_per_page}"
SORT_BY_PATTERN = "sbd:{sort_by}"
SINCE_PATTERN = "qdr:{since}"

# Number of results requested per page.
_RESULTS_PER_PAGE = 50

# Accepted `sort_by` values and the code sent in the URL.
# NOTE(review): 'date' and 'number' map to the same code; this looks like a
# copy-paste slip, but the correct code for 'date' cannot be confirmed from
# this file, so the existing mapping is kept -- verify against the website.
_SORT_BY_CODES = {
    'date': 'n',
    'number': 'n',
    'title': 't',
}

# Accepted `since` values and the code sent in the URL.
_SINCE_CODES = {
    'day': 'd',
    'week': 'w',
    'month': 'm',
    'year': 'y',
}


def getURLBase(soundscape=False, species=False,
               validated=False, not_validated=False,
               sort_by=None,
               since=None):
    """Return the base URL corresponding to the filters.

    :param soundscape: Whether to get ambient sounds. If soundscape and species are both False, no type filter is added (both types of sounds). Defaults to False.
    :param species: Whether to get species sounds. If soundscape and species are both False, no type filter is added (both types of sounds). Defaults to False.
    :param validated: Whether to get validated sounds. If validated and not_validated are both False, no validation filter is added (both types of sounds). Defaults to False.
    :param not_validated: Whether to get sounds that are not validated. If validated and not_validated are both False, no validation filter is added (both types of sounds). Defaults to False.
    :param sort_by: The (decreasing) order in which the files should be downloaded. Can be 'date', 'number' or 'title'. If None (default), no ordering is requested.
    :param since: How recent the downloaded files should be. Can be 'day', 'week', 'month' or 'year'. If None (default), no age limit is requested.
    :returns: The search URL, as a string.
    :raises NotImplementedError: If sort_by or since is set to an unsupported value.
    """
    filters = [RESULTS_PER_PAGE_PATTERN.format(results_per_page=_RESULTS_PER_PAGE)]

    # One `qr:` entry per enabled flag, always in this order.
    flags = (
        (soundscape, 'a'),
        (species, 'e'),
        (validated, 'v'),
        (not_validated, 'nv'),
    )
    for enabled, code in flags:
        if enabled:
            filters.append(FILTER_PATTERN.format(filter=code))

    if sort_by:
        if sort_by not in _SORT_BY_CODES:
            raise NotImplementedError(
                "sort_by cannot be set to {}".format(sort_by))
        filters.append(SORT_BY_PATTERN.format(sort_by=_SORT_BY_CODES[sort_by]))

    if since:
        if since not in _SINCE_CODES:
            raise NotImplementedError(
                "since cannot be set to {}".format(since))
        filters.append(SINCE_PATTERN.format(since=_SINCE_CODES[since]))

    return URL_PATTERN.format(filters=','.join(filters))
def getSoup(url):
    """Return the parsed HTML code from a URL.

    :param url: the URL to scrape.
    :returns: the page parsed by BeautifulSoup with the 'html.parser' parser.
    """
    # The response is a context manager: leaving the `with` block closes the
    # connection instead of leaving it open until garbage collection.
    with urllib.request.urlopen(url) as page:
        return bs.BeautifulSoup(page, features='html.parser')
def getSoupNext(soup):
    """Generate the HTML code of all the next pages.

    The page passed in is not yielded itself; only the pages reached by
    following its "next" link, one after the other.

    :param soup: the initial HTML code to scrape for the next page.
    :returns: a generator over the parsed HTML of each following page.
    """
    td = soup.find('td', attrs={'class': 'next'})
    while td:
        link = td.find('a')
        if link is None:
            # A "next" cell without a link is treated as the end of the
            # results rather than an error.
            break
        soup = getSoup(HOSTNAME + link['href'])
        yield soup
        # Read the "next" cell of the page just fetched; without this the
        # loop would request the same page forever.
        td = soup.find('td', attrs={'class': 'next'})
def getSoupFiles(soup):
    """Generate the URL of the page of each file listed on the current page.

    NOTE(review): despite the function's name, URLs (strings) are yielded,
    not parsed pages.

    :param soup: the HTML code to scrape.
    :returns: a generator over absolute URLs built from HOSTNAME and the
        'href' of the first link found in each <li class="table-row">.
    """
    for entry in soup.findAll('li', attrs={'class': 'table-row'}):
        link = entry.find('a')
        yield HOSTNAME + link['href']
def getFileInfo(soup) :
    """Return a dictionary containing information about the recording on the current page.

    NOTE(review): unfinished as far as this view shows -- the function only
    locates the 'sound-box' div and has no return statement, so it currently
    returns None.

    :param soup: the HTML code of the recording's page."""
    # First <div class="sound-box"> of the page (None if there is none).
    sidebar = soup.find('div', attrs = {'class' : 'sound-box'})
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment