Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Quentin Aristote
Soundscapes
Commits
f0d9302c
Commit
f0d9302c
authored
Nov 30, 2019
by
Quentin Aristote
Browse files
renamed id to title and file to sound
parent
c6e5bc88
Changes
1
Hide whitespace changes
Inline
Side-by-side
data/get_sounds.py
0 → 100644
View file @
f0d9302c
# -*- coding: utf-8 -*-
import
bs4
as
bs
import
urllib.request
import
os
import
os.path
# Root of the website being scraped; every other URL is derived from it.
HOSTNAME = "https://sonotheque.mnhn.fr"

# Search page: the comma-separated filter fragments go in the ``tbs`` parameter.
URL_SEARCH_PATTERN = HOSTNAME + "/?q=&tbs={filters}"

# Individual filter fragments, each filled in with ``str.format``.
FILTER_PATTERN = "qr:{filter}"
RESULTS_PER_PAGE_PATTERN = "ipp:{results_per_page}"
SORT_BY_PATTERN = "sbd:{sort_by}"
SINCE_PATTERN = "qdr:{since}"

# Page of a single sound, addressed by its title.
URL_SOUND_PATTERN = HOSTNAME + '/sounds/mnhn/so/{title}'
def getBaseURL(soundscape=False, species=False, validated=False,
               not_validated=False, sort_by=None, since=None,
               results_per_page=50):
    """Return the search URL corresponding to the filters.

    :param soundscape: whether to get ambient sounds. If soundscape and
        species are both False, no type filter is added. Defaults to False.
    :param species: whether to get species sounds. If soundscape and species
        are both False, no type filter is added. Defaults to False.
    :param validated: whether to get validated sounds. If validated and
        not_validated are both False, no validation filter is added.
        Defaults to False.
    :param not_validated: whether to get sounds that are not validated.
        Defaults to False.
    :param sort_by: the order in which the sounds should be listed.
        Can be 'date', 'number' or 'title'. If None (default), no sort
        fragment is added and the site's own default order applies.
    :param since: how recent the sounds should be. Can be 'day', 'week',
        'month' or 'year'. If None (default), no date fragment is added.
    :param results_per_page: number of results requested per page.
        Defaults to 50, the value that used to be hard-coded.
        NOTE(review): which values the site accepts is not visible here.
    :return: the search URL as a string.
    :raises NotImplementedError: if sort_by or since is set to an
        unsupported value.
    """
    # NOTE(review): 'date' and 'number' both map to 'n', exactly as before.
    # This looks like a copy-paste slip ('date' may need its own code) but
    # the site's real code cannot be confirmed from this file.
    sort_codes = {'date': 'n', 'number': 'n', 'title': 't'}
    since_codes = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

    filters = [RESULTS_PER_PAGE_PATTERN.format(results_per_page=results_per_page)]

    flags = (
        (soundscape, 'a'),
        (species, 'e'),
        (validated, 'v'),
        (not_validated, 'nv'),
    )
    for enabled, code in flags:
        if enabled:
            filters.append(FILTER_PATTERN.format(filter=code))

    if sort_by:
        try:
            sort_code = sort_codes[sort_by]
        except (KeyError, TypeError):
            # format() instead of "+" so a non-string value still produces
            # the intended NotImplementedError rather than a TypeError.
            raise NotImplementedError(
                "sort_by cannot be set to {}".format(sort_by)) from None
        filters.append(SORT_BY_PATTERN.format(sort_by=sort_code))

    if since:
        try:
            since_code = since_codes[since]
        except (KeyError, TypeError):
            raise NotImplementedError(
                "since cannot be set to {}".format(since)) from None
        filters.append(SINCE_PATTERN.format(since=since_code))

    return URL_SEARCH_PATTERN.format(filters=','.join(filters))
def getHTML(url):
    """Fetch a URL and return its parsed HTML.

    :param url: the URL to scrape.
    :return: a BeautifulSoup tree built with the 'html.parser' backend.
    :raises urllib.error.URLError: if the URL cannot be opened.
    """
    # Read the body inside a context manager so the HTTP response is closed
    # deterministically instead of lingering until garbage collection.
    with urllib.request.urlopen(url) as page:
        markup = page.read()
    return bs.BeautifulSoup(markup, features='html.parser')
def getNextHTML(html_current):
    """Generate the HTML of the current page, then of each following page.

    Pages are fetched lazily: the next page is only requested once the
    consumer asks for another item.

    :param html_current: the parsed HTML of the first page, scraped for the
        link to the next one.
    :return: a generator of parsed pages, starting with html_current.
    """
    html = html_current
    while True:
        yield html
        # The link to the next page is an <a> inside <td class="next">.
        # Either element may be absent (e.g. on the last page), so both
        # lookups are guarded; chaining .find() on a missing cell used to
        # raise AttributeError after the first page.
        cell = html.find('td', attrs={'class': 'next'})
        link = cell.find('a') if cell is not None else None
        if link is None:
            return
        html = getHTML(HOSTNAME + link['href'])
def getSoundsTitles(html):
    """Generate the titles for each sound listed on the current page.

    :param html: the parsed HTML of a results page.
    :return: a generator of title strings, one per <li class="table-row">.
    """
    # find_all is the current name of the deprecated findAll alias.
    for row in html.find_all('li', attrs={'class': 'table-row'}):
        label = row.find('span', attrs={'class': 'title'}).text
        # Only what follows the first 8 characters is kept.
        # NOTE(review): presumably this strips a fixed prefix such as
        # "MNHN-SO-" from the displayed title; confirm against the site.
        yield label[8:]
def getSoundHTML(title):
    """Fetch the page of a sound and return its parsed HTML.

    :param title: the title of the sound, substituted into URL_SOUND_PATTERN.
    :return: the result of getHTML for the sound's URL.
    """
    sound_url = URL_SOUND_PATTERN.format(title=title)
    return getHTML(sound_url)
def downloadSound(html, path='.'):
    """Download a sound from its page. Return the filename.

    :param html: the parsed HTML of the sound's page.
    :param path: where to save the sound. When it names an existing
        directory or ends with a path separator, no filename is passed to
        urlretrieve, which then stores the sound in a temporary file.
    :return: the base name of path when it designates a file, otherwise
        the location of the temporary file chosen by urlretrieve.
    :raises FileNotFoundError: if the page has no download link.
    """
    try:
        href = html.find('a', attrs={'class': 'button raised'})['href']
    except (AttributeError, KeyError, TypeError) as error:
        # Missing page, missing <a>, or an <a> without href. Only these are
        # caught so unrelated errors (e.g. KeyboardInterrupt) propagate.
        raise FileNotFoundError(
            'the sound does not seem available for download.') from error
    url = HOSTNAME + href

    if os.path.isdir(path):
        # e.g. the default '.', which os.path.split would turn into ('', '.')
        directory, filename = path, ''
    else:
        directory, filename = os.path.split(path)

    if not filename:
        saved_as, _ = urllib.request.urlretrieve(url)
        return saved_as

    # Write straight to the joined path instead of changing the working
    # directory, which was never restored when the download failed.
    urllib.request.urlretrieve(url, filename=os.path.join(directory, filename))
    return filename
def getSoundMetadata(html):
    """Return a dictionary containing information about the sound on the page.

    Placeholder: the scraping is not written yet, so every call raises.

    :param html: the HTML code of the recording's page.
    :raises NotImplementedError: always, with the message 'todo'.
    """
    message = 'todo'
    raise NotImplementedError(message)
def getRecordings(directory='.', soundscape=False, species=False,
                  validated=False, not_validated=False, sort_by=None,
                  since=None, overwrite=False):
    """Download all the recordings corresponding to specific filters.

    Each sound is saved as '<title>.mp3' inside directory. Progress and
    per-sound failures are printed; a failure on one sound does not stop
    the others.

    :param directory: the path to save the recordings to. Created if it
        does not exist.
    :param soundscape: whether to get ambient sounds. Passed to getBaseURL.
    :param species: whether to get species sounds. Passed to getBaseURL.
    :param validated: whether to get validated sounds. Passed to getBaseURL.
    :param not_validated: whether to get sounds that are not validated.
        Passed to getBaseURL.
    :param sort_by: the order in which the sounds should be downloaded.
        Can be 'date', 'number' or 'title', or None (default).
    :param since: how recent the downloaded sounds should be. Can be 'day',
        'week', 'month' or 'year', or None (default).
    :param overwrite: whether to download a sound again when its file
        already exists.
    :return: None.
    """
    if directory:
        # Without the directory every single download would fail.
        os.makedirs(directory, exist_ok=True)

    base_url = getBaseURL(soundscape=soundscape,
                          species=species,
                          validated=validated,
                          not_validated=not_validated,
                          sort_by=sort_by,
                          since=since)

    for html in getNextHTML(getHTML(base_url)):
        for title in getSoundsTitles(html):
            filename = title + '.mp3'
            path = os.path.join(directory, filename)
            print('Downloading {filename} ...'.format(filename=filename))

            if os.path.isfile(path) and not overwrite:
                print('Sound already downloaded.')
                continue

            try:
                # getSoundHTML is the helper that fetches a sound's page;
                # the name previously called here (getFileSound) is not
                # defined. Fetching inside the try keeps one bad page from
                # aborting the whole run.
                sound_html = getSoundHTML(title)
                downloadSound(sound_html, path=path)
            except Exception as e:
                print('Failure : {exception}'.format(exception=e))
            else:
                print('Success.')
    return None
if __name__ == '__main__':
    # Run as a script: download the validated ambient sounds into 'sounds'.
    getRecordings(
        directory='sounds',
        soundscape=True,
        validated=True,
    )
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment