# Standard libraries
import datetime
import os
import shutil
import urllib.request

# Third-party packages
import requests
from bs4 import BeautifulSoup

Web Scraping
Download .grb2 files from WAVEWATCH III web server
# WAVEWATCH III web server base URL
base_url = "https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/"

# Remove logs and data files at termination of this script
cleanup = True

# Send an HTTP request and return the response (None if the request fails)
def page_request(url):
    page = requests.get(url)
    if not page.ok:
        print(f"Error reaching URL: {url}")
        print(f"Page returned status code <{page.status_code}>")
    else:
        return page

# Remove the log file and the downloaded data directory
def do_cleanup():
    if os.path.exists(logfile):
        os.remove(logfile)
    if os.path.exists("./data/"):
        shutil.rmtree("./data/")
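Note that page_request returns None when a request fails, while the calls below use its .content attribute directly and therefore assume the server is reachable. A minimal, purely illustrative guard (not part of the script's flow) could look like:

response = page_request(base_url)
if response is None:
    raise SystemExit("Could not reach the WAVEWATCH III server")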
# Convert the requested page content to a Beautiful Soup object
soup = BeautifulSoup(page_request(base_url).content, "html.parser")

# Search for all anchor tags with an href. Each tag represents a data subdirectory.
# Discard the first and last anchor tags (parent directory and reference links).
links = soup.find_all("a", href=True)[1:-1]
grib_dirs = {tag["href"]: base_url + tag["href"] + "/gribs" for tag in links}

# Only want the latest year's data (for now), i.e. the last 12 monthly subdirectories
grib_dirs = dict(list(grib_dirs.items())[-12:])

gribs = {}
# Collect the .grb2 file links in each monthly "gribs" subdirectory
for grib_dir in grib_dirs:
    dir_soup = BeautifulSoup(page_request(grib_dirs[grib_dir]).content, "html.parser")
    grib_tags = dir_soup.find_all("a", href=True)[1:]  # skip the parent-directory link
    grib_links = {tag["href"]: grib_dirs[grib_dir] + "/" + tag["href"] for tag in grib_tags}
    gribs[grib_dir] = grib_links

logfile = "logfile_{}.log".format(datetime.datetime.now().strftime("%Y%m%d"))
with open(logfile, "w") as log:
log.write("Date;Last_Updated;Content_Size_MB;URL;Data_Dir;Filename\n")# Loop through data directories separated by month
for month in gribs:
    # Loop through each file in the data directory
    i = 0
    for grib in gribs[month]:
        # Only interested in two grids (glo_30m and ecg_10m) for now
        if not ("glo_30m" in grib or "ecg_10m" in grib):
            continue
        print(f"Retrieving file: {gribs[month][grib]}")
        # Path to save the grib file
        data_dir = os.path.join(os.getcwd(), "data", month)
        # Make the directory to save the file into
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        # Get header info from the URL endpoint
        meta = urllib.request.urlopen(gribs[month][grib]).info()
        # Write some information to the log file
        with open(logfile, "a") as log:
            log.write(
                "{};{};{};{};{};{}\n".format(
                    meta["Date"],
                    meta["Last-Modified"],
                    int(meta["Content-Length"]) / 1000000,
                    gribs[month][grib],
                    data_dir,
                    grib,
                )
            )
        # Send a request to the URL and save the response to a file
        urllib.request.urlretrieve(gribs[month][grib], os.path.join(data_dir, grib))
        # Only download a few files for development purposes. Remove in production
        if i >= 3:
            break
        else:
            i += 1
    break  # Only process the first month. Remove in production

if cleanup:
    do_cleanup()

Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.dp.200901.grb2
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.hs.200901.grb2
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.tp.200901.grb2
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.wind.200901.grb2
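As a quick sanity check on a retrieved file, one option is to open it with xarray via the cfgrib engine. This is a minimal sketch, assuming the optional cfgrib/eccodes dependencies are installed, that cleanup is set to False so the downloaded files are kept, and that the file landed in ./data/200901/ as in the output above:

import xarray as xr

# Hypothetical path to one of the files retrieved above
ds = xr.open_dataset("./data/200901/multi_reanal.ecg_10m.hs.200901.grb2", engine="cfgrib")
print(ds)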