wget or rsync, but I have never succeeded in making them work exactly the way I needed. So I wrote a small script for the task and passed it to him, hoping that this might be his first step towards learning Python. Below are two versions of the same script:
- The version I actually gave to my friend,
- An improved, slightly more scary-looking version, which is closer to the way I think it should be written.
First, below I show the quick and simple way of downloading the files, with minimal handling of possible errors.
import urllib2
import re
import os

url = "http://www.globsnow.info/se/archive_v2.1/{}/D4SC/"
start_year = 2003
end_year = 2003

for year in range(start_year, end_year + 1):
    year_url = url.format(year)

    # get the html of the directory listing
    x = urllib2.urlopen(year_url).read()

    # Get all words starting with GlobSnow and ending with .nc.gz, ? - means non-greedy
    fnames = re.findall(r"GlobSnow.*?\.nc\.gz", x)
    print len(fnames)
    fnames = set(fnames)  # Eliminate duplicates
    print len(fnames)

    for fname in fnames:
        if os.path.isfile(fname):  # No need to download the same file several times
            continue

        # "wb", since the archives are binary files
        with open(fname, "wb") as f:
            flink = os.path.join(year_url, fname)
            print "Downloading {} ....".format(flink)
            f.write(urllib2.urlopen(flink).read())

    print "Downloaded data for year {}".format(year)

print "All downloads finished successfully"
import urllib2
import re
import os

# link format
url = "http://www.globsnow.info/se/archive_v2.1/{}/D4SC/"

# Year range of the data to be downloaded
start_year = 2003
end_year = 2003

for year in range(start_year, end_year + 1):
    year_url = url.format(year)

    # get the html of the directory listing
    x = urllib2.urlopen(year_url).read()

    # Get all words starting with GlobSnow and ending with .nc.gz, ? - means non-greedy
    fnames = re.findall(r"GlobSnow.*?\.nc\.gz", x)
    print len(fnames)

    # Eliminate duplicates
    fnames = set(fnames)
    nfiles_per_year = len(fnames)

    for i, fname in enumerate(fnames):
        flink = os.path.join(year_url, fname)
        reader = urllib2.urlopen(flink)

        if os.path.isfile(fname):  # No need to download the same file several times
            remote_file_size = int(reader.info().getheaders("Content-length")[0])
            local_file_size = os.path.getsize(fname)
            if local_file_size != remote_file_size:  # The download was not completed for some reason
                os.remove(fname)
            else:
                reader.close()
                continue  # The file already exists and the size is OK

        # Write the local file to the disk ("wb", since the archives are binary files)
        with open(fname, "wb") as f:
            print "Downloading {} ....".format(flink)
            f.write(reader.read())

        print "Downloaded {} of {} files for {}".format(i + 1, nfiles_per_year, year)

        # Close the connection
        reader.close()

    print "Downloaded data for year {}".format(year)

print "All downloads finished successfully"
It is also a good idea to run the script inside tmux or screen, so that it keeps running even if the ssh session is closed for some reason. But if those are not installed, you can still get away with nohup, as follows:
nohup python download.py >& log.txt &
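You can then close the ssh session and later check how the downloads are going with tail -f log.txt.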
Cheers, and any comments are welcome!