Web Scraping

Download .grb2 files from the WAVEWATCH III web server.
# Standard libraries
import datetime
import os
import shutil
import urllib.request

# Third-party packages
import requests
from bs4 import BeautifulSoup
# WAVEWATCH III web server base URL
base_url = "https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/"
# Remove the log file and downloaded data when this script finishes.
cleanup = True
# Send an HTTP request and return the response, raising on a bad status code
def page_request(url):
    page = requests.get(url)
    if not page.ok:
        print("Error reaching URL: {}".format(url))
        print("Page returned status code <{}>".format(page.status_code))
        page.raise_for_status()
    return page
# Remove the log file and the downloaded data directory
def do_cleanup():
    if os.path.exists(logfile):
        os.remove(logfile)

    if os.path.exists("./data/"):
        shutil.rmtree("./data/")
# Convert the requested page content to a Beautiful Soup object
soup = BeautifulSoup(page_request(base_url).content, "html.parser")
# Find all anchor tags with an href; each one is a data subdirectory.
# Discard the first and last tags (parent directory and reference link).
links = soup.find_all("a", href=True)[1:-1]
grib_dirs = {link["href"]: base_url + link["href"] + "/gribs" for link in links}

# Only keep the most recent year of data (the last 12 monthly directories) for now
grib_dirs = dict(list(grib_dirs.items())[-12:])
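
For a quick sanity check, the mapping can be printed; each key is a month subdirectory (e.g. 200901, as seen in the sample output at the end of this section) and each value is that month's gribs listing URL. This inspection step is optional and not part of the original workflow.
# Optional: peek at a few month -> gribs-listing URL pairs
for month, month_url in list(grib_dirs.items())[:3]:
    print(month, "->", month_url)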
# Map each month directory to its available grib files: {filename: file URL}
gribs = {}
for month, month_url in grib_dirs.items():
    # Discard the first anchor tag (link back to the parent directory)
    grib_tags = BeautifulSoup(
        page_request(month_url).content, "html.parser"
    ).find_all("a", href=True)[1:]
    gribs[month] = {tag["href"]: month_url + "/" + tag["href"] for tag in grib_tags}
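
Each entry of gribs now maps a month to a {filename: URL} dictionary. The short, optional check below only prints a file count and one sample URL for the first month, just to confirm the nested structure before downloading anything.
# Optional: confirm the nested month -> {filename: URL} structure
for month, files in list(gribs.items())[:1]:
    print("{}: {} grib files listed".format(month, len(files)))
    if files:
        print("sample URL:", next(iter(files.values())))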
# Semicolon-delimited log recording metadata for each downloaded file
logfile = "logfile_{}.log".format(datetime.datetime.now().strftime("%Y%m%d"))
with open(logfile, "w") as log:
    log.write("Date;Last_Updated;Content_Size_MB;URL;Data_Dir;Filename\n")
# Loop through data directories separated by month
for month in gribs:
    # Development-only counter used to cap downloads per month
    i = 0
    # Loop through each file in the data directory
    for grib in gribs[month]:
        # Only interested in two gribs for now
        if not ("glo_30m" in grib or "ecg_10m" in grib):
            continue
        print("Retrieving file: {}".format(gribs[month][grib]))
        # Path to save grib file
        data_dir = os.path.join(os.getcwd(), "data", month)
        # Make directory to save file
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        # Get header info from URL endpoint
        meta = urllib.request.urlopen(gribs[month][grib]).info()
        # Write some information to a log file
        with open(logfile, "a") as log:
            log.write(
                "{};{};{};{};{};{}\n".format(
                    meta["Date"],
                    meta["Last-Modified"],
                    int(meta["Content-Length"]) / 1000000,
                    gribs[month][grib],
                    data_dir,
                    grib,
                )
            )
        # Send a request to the url and save the response file
        urllib.request.urlretrieve(gribs[month][grib], os.path.join(data_dir, grib))

        # Only download a few files for development purposes. Remove in production
        if i >= 3:
            break
        i += 1
    break  # Remove in production

if cleanup:
    do_cleanup()
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.dp.200901.grb2
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.hs.200901.grb2
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.tp.200901.grb2
Retrieving file: https://polar.ncep.noaa.gov/waves/hindcasts/nopp-phase2/200901/gribs/multi_reanal.ecg_10m.wind.200901.grb2
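
As a side note, urllib.request.urlretrieve offers no timeout or retry control. For large GRIB2 files over slow connections, streaming the download with requests is one possible alternative. The sketch below is not part of the script above; url and dest are stand-ins for gribs[month][grib] and os.path.join(data_dir, grib) from the download loop.
# Sketch: stream a single grib file to disk with a timeout (alternative to urlretrieve)
def download_grib(url, dest, chunk_size=1024 * 1024):
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(dest, "wb") as fh:
            for chunk in response.iter_content(chunk_size=chunk_size):
                fh.write(chunk)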