Commit 13dea479 authored by Quentin Aristote's avatar Quentin Aristote
Browse files

don't overwrite already existing files

parent 517d16f8
......@@ -3,6 +3,7 @@
import bs4 as bs
import urllib.request
import os
import os.path
HOSTNAME = "https://sonotheque.mnhn.fr"
......@@ -21,15 +22,15 @@ def getBaseURL(soundscape = False, species = False,
since = None) :
"""Return the base URL corresponding to the filters.
:param soundscape: Whether to get ambient sounds. If soundscape and species are False,
:param soundscape: whether to get ambient sounds. If soundscape and species are False,
get both types of sounds. Defaults to False.
:param species: Whether to get species sounds. If soundscape and species are False,
:param species: whether to get species sounds. If soundscape and species are False,
get both types of sounds. Defaults to False.
:param validated: Whether to get validated sounds. If validated and not_validated are False,
:param validated: whether to get validated sounds. If validated and not_validated are False,
get both types of sounds. Defaults to False.
:param sort_by: The (decreasing) order in which the files should be downloaded.
:param sort_by: the (decreasing) order in which the files should be downloaded.
Can be 'date', 'number' or 'title'. If None (default), they are ordered by date.
:param since: How recent the downloaded files should be. Can be 'day', 'week', 'month' or 'year'.
:param since: how recent the downloaded files should be. Can be 'day', 'week', 'month' or 'year'.
If None (default), all the files are downloaded."""
filters = ['ipp:50']
......@@ -124,15 +125,12 @@ def downloadFile(html, path = '.') :
:param html: the HTML code to scrape.
:param path: the path from the current directory to save the file to."""
path = path.split('/')
filename = None
if len(path) >= 2 :
if path[-1] != '' :
filename = path[-1]
path = path[:-1]
directory, filename = os.path.split(path)
if filename == '' :
filename = None
cwd = os.getcwd()
os.chdir('/'.join(path))
os.chdir(directory)
url = HOSTNAME + html.find('a', attrs = {'class' : 'button raised'})['href']
filename, _ = urllib.request.urlretrieve(url, filename = filename)
......@@ -154,7 +152,8 @@ def getRecordings(directory = '.',
soundscape = False, species = False,
validated = False, not_validated = False,
sort_by = None,
since = None) :
since = None,
overwrite = False) :
"""Download all the recordings corresponding to specific filters.
:param directory: the path to save the recordings to.
......@@ -167,7 +166,8 @@ def getRecordings(directory = '.',
:param sort_by: The (decreasing) order in which the files should be downloaded.
Can be 'date', 'number' or 'title'. If None (default), they are ordered by date.
:param since: How recent the downloaded files should be. Can be 'day', 'week', 'month' or 'year'.
If None (default), all the files are downloaded."""
If None (default), all the files are downloaded.
:param overwrite: whether to download a file again even if it already exists at the target path. Defaults to False."""
base_url = getBaseURL(soundscape = soundscape, species = species,
validated = validated, not_validated = not_validated,
......@@ -177,11 +177,12 @@ def getRecordings(directory = '.',
for html in getNextHTML(base_html) :
for id in getFilesIDs(html) :
file_html = getFileHTML(id)
filename = id + '.mp3'
path = directory + '/' + filename
_ = downloadFile(file_html, path = path)
print('Downloading {filename} ...'.format(filename = filename))
path = os.path.join(directory, filename)
if not(os.path.isfile(path)) or overwrite :
file_html = getFileHTML(id)
_ = downloadFile(file_html, path = path)
print('Downloading {filename} ...'.format(filename = filename))
return None
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment