# Standard libraries
import os
import sys
import shutil
import datetime
import urllib.request

# Third-party packages
import requests
from bs4 import BeautifulSoup
Web Scraping
Download .grb2 files from the WAVEWATCH III hindcast web server.
# WAVEWATCH III web server base URL
base_url = "https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/"

# Remove logs and data files at termination of this script.
cleanup = True
# Send an HTTP request
def page_request(url):
    page = requests.get(url)
    if not page.ok:
        print("Error reaching URL: {}".format(url))
        print("Page returned status code <{}>".format(page.status_code))
    else:
        return page
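A quick usage sketch, not part of the original script: page_request returns a requests.Response when the status code is below 400 and None otherwise, so callers can check for a failed request before parsing (index_page is just an illustrative name).

index_page = page_request(base_url)
if index_page is not None:
    print("Fetched {} bytes from {}".format(len(index_page.content), base_url))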
# Remove the log file and the downloaded data directory
def do_cleanup():
    if os.path.exists(logfile):
        os.remove(logfile)
    if os.path.exists("./data/"):
        shutil.rmtree("./data/")
# Convert the requested page content to a Beautiful Soup object
soup = BeautifulSoup(page_request(base_url).content, "html.parser")

# Search for all the anchor tags with an href. Each tag represents a data subdirectory.
# Discard the first and last anchor tags [0 = parent dir, -1 = reference].
links = soup.find_all("a", href=True)[1:-1]
grib_dirs = dict(map(lambda x: (x["href"], base_url + x["href"] + "/gribs"), links))

# Only want the latest year's data (for now)
grib_dirs = dict(list(grib_dirs.items())[-12:])
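To confirm the scrape picked up the expected subdirectories, the dictionary can be inspected before any downloads start (illustrative only; assumes at least one link was found):

months = list(grib_dirs)
print("{} directories, from {} to {}".format(len(months), months[0], months[-1]))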
# Collect the grib file links from each monthly data subdirectory
gribs = {}
for grib_dir in grib_dirs:
    grib_tags = BeautifulSoup(
        page_request(grib_dirs[grib_dir]).content, "html.parser"
    ).find_all("a", href=True)[1:]
    grib_links = dict(
        map(lambda x: (x["href"], grib_dirs[grib_dir] + "/" + x["href"]), grib_tags)
    )
    gribs.update({grib_dir: grib_links})
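The result is a nested dictionary mapping each monthly directory to its {filename: URL} pairs. A quick, optional way to see how many files each month exposes:

for month, files in gribs.items():
    print("{}: {} grib files".format(month, len(files)))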
= "logfile_{}.log".format(datetime.datetime.now().strftime("%Y%m%d"))
logfile with open(logfile, "w") as log:
"Date;Last_Updated;Content_Size_MB;URL;Data_Dir;Filename\n") log.write(
# Loop through data directories separated by month
for month in gribs:
    # Loop through each file in the data directory
    i = 0
    for grib in gribs[month]:
        # Only interested in two gribs for now
        if not ("glo_30m" in grib or "ecg_10m" in grib):
            continue
        print("Retrieving file: {}".format(gribs[month][grib]))
        # Path to save the grib file
        data_dir = os.path.join(os.getcwd(), "data", month)
        # Make directory to save the file
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        # Get header info from the URL endpoint
        meta = urllib.request.urlopen(gribs[month][grib]).info()
        # Write some information to the log file
        with open(logfile, "a") as log:
            log.write(
                "{};{};{};{};{};{}\n".format(
                    meta["Date"],
                    meta["Last-Modified"],
                    int(meta["Content-Length"]) / 1000000,
                    gribs[month][grib],
                    data_dir,
                    grib,
                )
            )
        # Send a request to the URL and save the response file
        urllib.request.urlretrieve(gribs[month][grib], os.path.join(data_dir, grib))
        # Only download a couple of files for development purposes. Remove in production.
        if i >= 3:
            break
        else:
            i += 1
    break  # Remove if in production
if cleanup:
    do_cleanup()
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.dp.200901.grb2
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.hs.200901.grb2
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.tp.200901.grb2
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.wind.200901.grb2
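A possible extension, not implemented above, is to compare each saved file's size against the Content-Length reported by the server before trusting the .grb2. A minimal sketch using only the standard library (verify_download is a hypothetical helper name):

def verify_download(local_path, expected_bytes):
    # True when the file exists and matches the size advertised by the server
    return os.path.exists(local_path) and os.path.getsize(local_path) == int(expected_bytes)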