You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
187 lines
5.4 KiB
187 lines
5.4 KiB
/* |
|
* Copyright 2014 Mozilla Foundation |
|
* |
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
|
* you may not use this file except in compliance with the License. |
|
* You may obtain a copy of the License at |
|
* |
|
* http://www.apache.org/licenses/LICENSE-2.0 |
|
* |
|
* Unless required by applicable law or agreed to in writing, software |
|
* distributed under the License is distributed on an "AS IS" BASIS, |
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
* See the License for the specific language governing permissions and |
|
* limitations under the License. |
|
*/ |
|
/* eslint-disable object-shorthand */ |
|
|
|
'use strict'; |
|
|
|
var fs = require('fs'); |
|
var crypto = require('crypto'); |
|
var http = require('http'); |
|
var https = require('https'); |
|
|
|
/**
 * Rewrites a Web Archive snapshot URL so that the raw archived file is
 * served instead of an HTML page that embeds it in an iframe (issue 8920).
 *
 * @param {string} url - URL to inspect.
 * @returns {string} The URL with `if_` appended to the snapshot ID when it is
 *   a Web Archive snapshot URL; otherwise the URL unchanged.
 */
function rewriteWebArchiveUrl(url) {
  // Groups: 1 = archive prefix, 2 = numeric snapshot ID, 3 = archived URL.
  var match =
    /(^https?:\/\/web\.archive\.org\/web\/)(\d+)(\/https?:\/\/.+)/g.exec(url);
  if (!match) {
    return url;
  }
  var archivePrefix = match[1];
  var snapshotId = match[2];
  var archivedUrl = match[3];
  return archivePrefix + snapshotId + 'if_' + archivedUrl;
}
|
|
|
/**
 * Downloads `url` into `file`, following HTTP redirects and falling back to
 * the Wayback Machine when the resource is missing (404) or the host name
 * cannot be resolved.
 *
 * @param {string} file - Path of the file the response body is written to.
 * @param {string} url - HTTP(S) URL to fetch; it is first passed through
 *   `rewriteWebArchiveUrl`.
 * @param {function} callback - Called with no arguments on success, or with
 *   an error (a string such as 'HTTP 500' or 'Too many redirects', or an
 *   Error object from the request/write stream) on failure.
 * @param {number} [redirects] - Number of redirects/fallbacks already taken;
 *   treated as 0 when omitted.
 */
function downloadFile(file, url, callback, redirects) {
  url = rewriteWebArchiveUrl(url);

  var hops = redirects || 0;
  var completed = false;

  // Reports the outcome of this request to the caller at most once.
  function finish(err) {
    if (completed) {
      return;
    }
    completed = true;
    if (err) {
      callback(err);
    } else {
      callback();
    }
  }

  // Hands the download over to a follow-up request. This request is marked
  // as completed so a late error on it cannot also reach the callback.
  function continueWith(nextUrl) {
    completed = true;
    downloadFile(file, nextUrl, callback, hops + 1);
  }

  var protocol = /^https:\/\//.test(url) ? https : http;
  protocol.get(url, function (response) {
    var status = response.statusCode;

    if (status === 301 || status === 302 || status === 307 || status === 308) {
      // The redirect body is never read; drain it so the socket is released.
      response.resume();
      if (hops > 10) {
        // Stop here: following further would loop and re-invoke the callback.
        finish('Too many redirects');
        return;
      }
      var location = response.headers.location;
      if (!location) {
        finish('HTTP ' + status + ' without Location header');
        return;
      }
      continueWith(require('url').resolve(url, location));
      return;
    }

    if (status === 404 && url.indexOf('web.archive.org') < 0) {
      // trying waybackmachine
      response.resume();
      continueWith('http://web.archive.org/web/' + url);
      return;
    }

    if (status !== 200) {
      response.resume();
      finish('HTTP ' + status);
      return;
    }

    var stream = fs.createWriteStream(file);
    stream.on('error', finish);
    response.pipe(stream);
    stream.on('finish', function () {
      stream.close();
      finish();
    });
  }).on('error', function (err) {
    if (completed) {
      return;
    }
    // Current Node.js reports the symbolic name in `err.code` (`err.errno` is
    // numeric); older releases put the string in `err.errno`. Accept both.
    var hostNotFound = !!err && typeof err === 'object' &&
      (err.code === 'ENOTFOUND' || err.errno === 'ENOTFOUND');
    if (hostNotFound && url.indexOf('web.archive.org') < 0) {
      // trying waybackmachine
      continueWith('http://web.archive.org/web/' + url);
      return;
    }
    finish(err);
  });
}
|
|
|
/**
 * Downloads, one after another, every manifest entry that is marked as a
 * link and whose target file does not exist yet. The URL of each entry is
 * read from the `<file>.link` file next to it (trailing whitespace removed).
 *
 * When a download fails, an empty `<file>` is written together with a
 * `<file>.error` file holding the error text, and processing continues with
 * the next entry.
 *
 * @param {Array<Object>} manifest - Entries with a `file` path and an
 *   optional truthy `link` flag.
 * @param {function} callback - Called with no arguments once every pending
 *   download has been attempted.
 */
function downloadManifestFiles(manifest, callback) {
  var links = manifest.filter(function (item) {
    return item.link && !fs.existsSync(item.file);
  }).map(function (item) {
    var file = item.file;
    var linkfile = file + '.link';
    var url = fs.readFileSync(linkfile).toString();
    url = url.replace(/\s+$/, '');
    return { file: file, url: url, };
  });

  var i = 0;

  function downloadNext() {
    if (i >= links.length) {
      callback();
      return;
    }
    var file = links[i].file;
    var url = links[i].url;
    console.log('Downloading ' + url + ' to ' + file + '...');
    downloadFile(file, url, function (err) {
      if (err) {
        console.error('Error during downloading of ' + url + ': ' + err);
        fs.writeFileSync(file, ''); // making it empty file
        // `err` can be an Error object; fs.writeFileSync only accepts
        // strings/buffers, so convert it explicitly instead of throwing.
        fs.writeFileSync(file + '.error', String(err));
      }
      i++;
      downloadNext();
    });
  }

  downloadNext();
}
|
|
|
/**
 * Computes the MD5 digest of a file by streaming its contents.
 *
 * @param {string} file - Path of the file to hash.
 * @param {function} callback - Called as `callback(err)` when reading fails,
 *   or as `callback(null, hexDigest)` once the whole file has been read.
 */
function calculateMD5(file, callback) {
  var md5 = crypto.createHash('md5');

  fs.createReadStream(file)
    .on('data', function (chunk) {
      md5.update(chunk);
    })
    .on('error', function (readError) {
      callback(readError);
    })
    .on('end', function () {
      callback(null, md5.digest('hex'));
    });
}
|
|
|
/**
 * Checks every manifest entry in order: entries with a `<file>.error` marker
 * are reported as not downloaded; all others are hashed with `calculateMD5`
 * and compared against the entry's `md5`. Every problem is printed as a
 * warning on stderr.
 *
 * @param {Array<Object>} manifest - Entries with a `file` path and an
 *   expected `md5` hex digest.
 * @param {function} callback - Called once with `true` when at least one
 *   entry had a problem (failed download, unreadable file, missing or
 *   mismatching md5), otherwise with `false`.
 */
function verifyManifestFiles(manifest, callback) {
  var i = 0;
  var error = false;

  function verifyNext() {
    if (i >= manifest.length) {
      callback(error);
      return;
    }
    var item = manifest[i];
    if (fs.existsSync(item.file + '.error')) {
      console.error('WARNING: File was not downloaded. See "' +
                    item.file + '.error" file.');
      error = true;
      i++;
      verifyNext();
      return;
    }
    calculateMD5(item.file, function (err, md5) {
      if (err) {
        // Reported on stderr like every other warning in this function.
        console.error('WARNING: Unable to open file for reading "' + err + '".');
        error = true;
      } else if (!item.md5) {
        console.error('WARNING: Missing md5 for file "' + item.file + '". ' +
                      'Hash for current file is "' + md5 + '"');
        error = true;
      } else if (md5 !== item.md5) {
        console.error('WARNING: MD5 of file "' + item.file +
                      '" does not match file. Expected "' +
                      item.md5 + '" computed "' + md5 + '"');
        error = true;
      }
      i++;
      verifyNext();
    });
  }

  verifyNext();
}
|
|
|
// Public entry points of this module.
exports.downloadManifestFiles = downloadManifestFiles;
exports.verifyManifestFiles = verifyManifestFiles;
|
|
|