/*
    File:       LinkedImageFetcher.m

    Contains:   Downloads an HTML page and then downloads all of the referenced images.

    Written by: DTS

    Copyright:  Copyright (c) 2011-2013 Apple Inc. All Rights Reserved.

    Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc.
                ("Apple") in consideration of your agreement to the following
                terms, and your use, installation, modification or
                redistribution of this Apple software constitutes acceptance of
                these terms. If you do not agree with these terms, please do
                not use, install, modify or redistribute this Apple software.

                In consideration of your agreement to abide by the following
                terms, and subject to these terms, Apple grants you a personal,
                non-exclusive license, under Apple's copyrights in this
                original Apple software (the "Apple Software"), to use,
                reproduce, modify and redistribute the Apple Software, with or
                without modifications, in source and/or binary forms; provided
                that if you redistribute the Apple Software in its entirety and
                without modifications, you must retain this notice and the
                following text and disclaimers in all such redistributions of
                the Apple Software. Neither the name, trademarks, service marks
                or logos of Apple Inc. may be used to endorse or promote
                products derived from the Apple Software without specific prior
                written permission from Apple. Except as expressly stated in
                this notice, no other rights or licenses, express or implied,
                are granted by Apple herein, including but not limited to any
                patent rights that may be infringed by your derivative works or
                by other works in which the Apple Software may be incorporated.

                The Apple Software is provided by Apple on an "AS IS" basis.
                APPLE MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
                WITHOUT LIMITATION THE IMPLIED WARRANTIES OF NON-INFRINGEMENT,
                MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, REGARDING
                THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN
                COMBINATION WITH YOUR PRODUCTS.

                IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT,
                INCIDENTAL OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
                TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
                DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ARISING IN ANY WAY
                OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION
                OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY
                OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY OR
                OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF
                SUCH DAMAGE.

*/
|
#import "LinkedImageFetcher.h" |
|
#import "ImageDownloadOperation.h" |
#import "PageGetOperation.h" |
#import "LinkFinder.h" |
|
@interface LinkedImageFetcher ()

// Publicly visible state that this implementation needs to be able to write.
// NOTE(review): -stopWithError: and -operationDidFinish also assign self.done, which
// relies on 'done' being writable through a declaration outside this file -- confirm
// against the header.

@property (nonatomic, copy,   readwrite) NSError *                  error;

// Private bookkeeping.

@property (nonatomic, strong, readonly ) QWatchedOperationQueue *   queue;                  // created on first use by the -queue getter
@property (nonatomic, strong, readonly ) NSMutableSet *             foundPageURLs;          // every page URL passed to -startPageGet:depth:
@property (nonatomic, strong, readonly ) NSMutableDictionary *      foundImageURLToPathMap; // image URL -> NSNull placeholder, file path, or NSError
@property (nonatomic, assign, readwrite) NSUInteger                 runningOperationCount;  // operations started but not yet reported as finished

// Declared ahead of its definition because -start calls it.

- (void)startPageGet:(NSURL *)pageURL depth:(NSUInteger)depth;

@end
|
@implementation LinkedImageFetcher |
|
// Public configuration.

@synthesize URL                     = _URL;
@synthesize maximumDepth            = _maximumDepth;
@synthesize imagesDirPath           = _imagesDirPath;
@synthesize delegate                = _delegate;

// Public results.

@synthesize done                    = _done;
@synthesize error                   = _error;

// Private bookkeeping (queue has a hand-written getter, so it is not listed here).

@synthesize foundPageURLs           = _foundPageURLs;
@synthesize foundImageURLToPathMap  = _foundImageURLToPathMap;
@synthesize runningOperationCount   = _runningOperationCount;
|
+ (BOOL)isSupportedURL:(NSURL *)url
    // Returns YES if the URL's scheme, compared case-insensitively, is
    // "http" or "https".  A URL with no scheme is not supported.
{
    NSString *  normalisedScheme;

    assert(url != nil);

    normalisedScheme = [[url scheme] lowercaseString];
    if ([normalisedScheme isEqual:@"http"]) {
        return YES;
    }
    return [normalisedScheme isEqual:@"https"];
}
|
- (id)initWithURL:(NSURL *)url
    // See comment in header.
    //
    // Copies the URL, defaults imagesDirPath to an "images" directory inside
    // the temporary directory, and creates the empty page set and image map.
{
    assert(url != nil);
    assert([[self class] isSupportedURL:url]);

    self = [super init];
    if (self == nil) {
        return nil;
    }

    _URL = [url copy];

    _imagesDirPath = [[NSTemporaryDirectory() stringByAppendingPathComponent:@"images"] copy];
    assert(_imagesDirPath != nil);

    _foundPageURLs = [[NSMutableSet alloc] init];
    assert(_foundPageURLs != nil);

    _foundImageURLToPathMap = [[NSMutableDictionary alloc] init];
    assert(_foundImageURLToPathMap != nil);

    return self;
}
|
- (void)dealloc
    // Releases everything the fetcher owns.  The queue is invalidated and its
    // operations cancelled before our reference to it is dropped.
{
    [_foundPageURLs release];
    [_foundImageURLToPathMap release];
    [_imagesDirPath release];

    // Use the ivar, not the lazy -queue getter, so that we don't create a
    // queue just to tear it down.

    [_queue invalidate];
    [_queue cancelAllOperations];
    [_queue release];

    [_error release];
    [_URL release];

    [super dealloc];
}
|
- (QWatchedOperationQueue *)queue
    // Lazy getter: the watched queue, targeting this fetcher, is created
    // the first time anyone asks for it.
{
    if (_queue != nil) {
        return _queue;
    }

    _queue = [[QWatchedOperationQueue alloc] initWithTarget:self];
    assert(_queue != nil);

    return _queue;
}
|
- (NSDictionary *)imageURLToPathMap
    // Hands back an autoreleased copy of the internal image map rather than
    // the map itself, so a caller that asks before the fetcher is done gets a
    // snapshot instead of the mutable dictionary that is still being updated.
{
    NSDictionary *  snapshot;

    snapshot = [self.foundImageURLToPathMap copy];
    return [snapshot autorelease];
}
|
- (BOOL)start
    // See comment in header.
    //
    // Makes sure the images directory is usable and, if so, kicks off the
    // GET of the main page.  Returns NO, without starting anything, if the
    // directory can neither be created nor listed.
{
    NSFileManager * fileManager;
    BOOL            directoryReady;

    fileManager = [NSFileManager defaultManager];
    assert(fileManager != nil);

    directoryReady = [fileManager createDirectoryAtPath:self.imagesDirPath withIntermediateDirectories:NO attributes:nil error:NULL];
    if ( ! directoryReady ) {
        // Creation can fail simply because the directory is already there, so
        // treat a successful listing as proof that it is usable.

        directoryReady = ([fileManager contentsOfDirectoryAtPath:self.imagesDirPath error:NULL] != nil);
    }
    if ( ! directoryReady ) {
        return NO;
    }

    // Fetch the top-level HTML; everything else is driven from its links.

    [self startPageGet:self.URL depth:0];
    return YES;
}
|
- (void)stopWithError:(NSError *)error
    // Internal: abandons the fetch.  Invalidates the queue, cancels its
    // operations, records the supplied error and then flags the fetcher as done.
{
    QWatchedOperationQueue *    watchedQueue;

    assert(error != nil);

    watchedQueue = self.queue;
    [watchedQueue invalidate];
    [watchedQueue cancelAllOperations];

    self.error = error;

    // Setting done can cause a client's KVO handler to release us.  Keep
    // ourselves alive until the current autorelease pool drains so that we
    // never continue running with a deallocated self.

    [[self retain] autorelease];
    self.done = YES;
}
|
- (void)stop
    // See comment in header.
    //
    // Stops the fetch, reporting it as a user cancellation.
{
    NSError *   cancelledError;

    cancelledError = [NSError errorWithDomain:NSCocoaErrorDomain code:NSUserCancelledError userInfo:nil];
    [self stopWithError:cancelledError];
}
|
- (void)logText:(NSString *)text URL:(NSURL *)url depth:(NSUInteger)depth error:(NSError *)error
    // Internal: reports progress.  With a delegate, the message is forwarded
    // to it (if it implements the logging callback, otherwise it is dropped).
    // Without a delegate, two lines are written to stdout, indented by two
    // columns per level of depth: the URL, then the text (followed by the
    // error's domain and code if an error was supplied).
{
    int     indent;

    assert(text != nil);
    assert(url != nil);
    // depth may be anything; error may be nil

    if (self.delegate != nil) {
        if ([self.delegate respondsToSelector:@selector(linkedImageFetcher:logText:URL:depth:error:)]) {
            [self.delegate linkedImageFetcher:self logText:text URL:url depth:depth error:error];
        }
        return;
    }

    // No delegate: stdout is what best suits the command line tool.

    indent = (int) (depth * 2);
    fprintf(stdout, "%*s%s\n", indent, "", [[url absoluteString] UTF8String]);
    if (error != nil) {
        fprintf(stdout, "%*s %s: %s %d\n", indent, "", [text UTF8String], [[error domain] UTF8String], (int) [error code]);
    } else {
        fprintf(stdout, "%*s %s\n", indent, "", [text UTF8String]);
    }
}
|
// IMPORTANT: runningOperationCount is only ever modified by the main thread,
// so we don't have to do any locking.  Also, because the 'done' methods are called
// on the main thread, we don't have to worry about early completion, that is,
// -parseDone: kicking off download 1, then getting delayed, then download 1
// completing, decrementing runningOperationCount, and deciding that we're
// all done.  The decrement of runningOperationCount is done by -downloadDone:
// and -downloadDone: can't run until we return back to the run loop.
|
- (void)operationDidStart
    // Bumps runningOperationCount; call once for every operation added to the queue.
{
    NSUInteger  countBefore;

    countBefore = self.runningOperationCount;
    self.runningOperationCount = countBefore + 1;
}
|
- (void)operationDidFinish
    // Called when an operation has finished to decrement runningOperationCount
    // and complete the whole fetch if it hits zero.
    //
    // If the fetch has already been flagged as done (for example, -pageGetDone:
    // and -parseDone: call -stopWithError: for a depth 0 failure and then fall
    // through to here), done is not set a second time, so observers of 'done'
    // see a single completion notification.
{
    assert(self.runningOperationCount != 0);
    self.runningOperationCount -= 1;

    if ( (self.runningOperationCount == 0) && ! self.done ) {
        // Setting done can cause a client's KVO handler to release us, so keep
        // ourselves alive until the current autorelease pool drains (same
        // technique as -stopWithError:).
        [[self retain] autorelease];
        self.done = YES;
    }
}
|
- (void)startPageGet:(NSURL *)pageURL depth:(NSUInteger)depth
    // Queues a GET for an HTML page, for the main page as well as for every
    // page it links to.  The URL is recorded in foundPageURLs so that it is
    // never fetched twice; it must be absolute and not already recorded.
{
    PageGetOperation *  pageGet;

    assert([pageURL baseURL] == nil);       // i.e. the URL is absolute
    assert( ! [self.foundPageURLs containsObject:pageURL] );

    [self.foundPageURLs addObject:pageURL];

    pageGet = [[PageGetOperation alloc] initWithURL:pageURL depth:depth];
    assert(pageGet != nil);
    [pageGet autorelease];

    [self.queue addOperation:pageGet finishedAction:@selector(pageGetDone:)];
    [self operationDidStart];

    // The story picks up again in -pageGetDone:.
}
|
- (void)pageGetDone:(PageGetOperation *)op
    // Finished action for a page GET.  On success the downloaded HTML is
    // handed to a LinkFinder operation for parsing.  On failure the whole
    // fetch is stopped if this was the main page (depth 0); otherwise the
    // failure is only logged.
{
    NSError *   getError;

    assert([op isKindOfClass:[PageGetOperation class]]);
    assert([NSThread isMainThread]);

    getError = op.error;
    if (getError == nil) {
        LinkFinder *    linkFinder;

        [self logText:@"page get done" URL:op.URL depth:op.depth error:nil];

        // Parse relative to the URL of the final response, not op.URL, so that
        // relative links still resolve correctly after a redirect.

        linkFinder = [[[LinkFinder alloc] initWithData:op.responseBody fromURL:[op.lastResponse URL] depth:op.depth] autorelease];
        assert(linkFinder != nil);

        linkFinder.useRelaxedParsing = YES;

        [self.queue addOperation:linkFinder finishedAction:@selector(parseDone:)];
        [self operationDidStart];

        // The story picks up again in -parseDone:.
    } else if (op.depth == 0) {
        // Without the main page there is nothing to do, so this is fatal.
        [self stopWithError:getError];
    } else {
        // A linked page failing is not fatal; just note it.
        [self logText:@"page get error" URL:op.URL depth:op.depth error:getError];
    }

    [self operationDidFinish];
}
|
- (void)parseDone:(LinkFinder *)op
    // Finished action for a LinkFinder operation.  On success, starts a page
    // GET for each new linked page (unless we're at the maximum depth) and an
    // image download for each new image.  On failure the whole fetch is
    // stopped if this was the main page (depth 0); otherwise the failure is
    // only logged.
{
    #pragma unused(op)
    NSError *   parseError;

    assert([op isKindOfClass:[LinkFinder class]]);
    assert([NSThread isMainThread]);

    parseError = op.error;
    if (parseError != nil) {

        if (op.depth == 0) {
            [self stopWithError:parseError];
        } else {
            [self logText:@"page parse error" URL:op.URL depth:op.depth error:parseError];
        }

    } else {

        // Membership tests against foundPageURLs and foundImageURLToPathMap
        // are always done with absolute URLs.

        // Linked pages: only followed if we are not yet at the maximum depth.

        if (op.depth != self.maximumDepth) {
            for (NSURL * linkURL in op.linkURLs) {
                NSURL * absoluteLinkURL;

                absoluteLinkURL = [linkURL absoluteURL];
                assert(absoluteLinkURL != nil);

                if ( ! [[self class] isSupportedURL:absoluteLinkURL] ) {
                    [self logText:@"page URL is unsupported" URL:absoluteLinkURL depth:op.depth error:nil];
                    continue;
                }
                if ([self.foundPageURLs containsObject:absoluteLinkURL]) {
                    [self logText:@"page is duplicate" URL:absoluteLinkURL depth:op.depth error:nil];
                    continue;
                }
                if ( ([absoluteLinkURL fragment] != nil) || ([absoluteLinkURL parameterString] != nil) || ([absoluteLinkURL query] != nil) ) {
                    [self logText:@"page URL is complex" URL:absoluteLinkURL depth:op.depth error:nil];
                    continue;
                }

                [self startPageGet:absoluteLinkURL depth:op.depth + 1];
            }
        }

        // Images: downloaded regardless of depth, but each URL only once.

        for (NSURL * imageURL in op.imageURLs) {
            NSURL *                     absoluteImageURL;
            ImageDownloadOperation *    imageDownload;

            absoluteImageURL = [imageURL absoluteURL];
            assert(absoluteImageURL != nil);

            if ( ! [[self class] isSupportedURL:absoluteImageURL] ) {
                [self logText:@"image URL is unsupported" URL:absoluteImageURL depth:op.depth error:nil];
                continue;
            }
            if ([self.foundImageURLToPathMap objectForKey:absoluteImageURL] != nil) {
                [self logText:@"image is duplicate" URL:absoluteImageURL depth:op.depth error:nil];
                continue;
            }

            // Claim the URL with a placeholder; -downloadDone: replaces it.

            [self.foundImageURLToPathMap setObject:[NSNull null] forKey:absoluteImageURL];

            imageDownload = [[[ImageDownloadOperation alloc] initWithURL:absoluteImageURL imagesDirPath:self.imagesDirPath depth:op.depth + 1] autorelease];
            assert(imageDownload != nil);

            [self.queue addOperation:imageDownload finishedAction:@selector(downloadDone:)];
            [self operationDidStart];

            // The story picks up again in -downloadDone:.
        }
    }

    [self operationDidFinish];
}
|
- (void)downloadDone:(ImageDownloadOperation *)op
    // Finished action for an image download.  Swaps the NSNull placeholder in
    // foundImageURLToPathMap for the outcome: the downloaded file's path on
    // success, or the error on failure.
{
    #pragma unused(op)
    NSURL *     originalURL;
    NSError *   downloadError;

    assert([op isKindOfClass:[ImageDownloadOperation class]]);
    assert([NSThread isMainThread]);

    // The map is keyed on the URL we asked for (op.URL), not on the URL of the
    // final response after any redirects.

    originalURL = op.URL;
    assert([[self.foundImageURLToPathMap objectForKey:originalURL] isEqual:[NSNull null]]);

    downloadError = op.error;
    if (downloadError == nil) {
        [self.foundImageURLToPathMap setObject:op.imageFilePath forKey:originalURL];
        [self logText:[NSString stringWithFormat:@"image download to: %@", op.imageFilePath] URL:originalURL depth:op.depth error:nil];
    } else {
        [self.foundImageURLToPathMap setObject:downloadError forKey:originalURL];
        [self logText:@"image download error" URL:originalURL depth:op.depth error:downloadError];
    }

    [self operationDidFinish];
}
|
+ (id)fetcherWithURLString:(const char *)urlCStr maximumDepth:(int)maximumDepth
    // See comment in header.
    //
    // Builds an autoreleased fetcher from command line style arguments.
    // Returns nil, after printing a diagnostic to stderr, if the string can't
    // be turned into a URL, if the URL's scheme is not supported, or if
    // maximumDepth is negative.
{
    LinkedImageFetcher *    result;
    NSString *              urlStr;
    NSURL *                 url;

    assert(urlCStr != NULL);

    // First construct and check the URL.

    url = nil;
    urlStr = @(urlCStr);
    if (urlStr != nil) {
        url = [NSURL URLWithString:urlStr];
    }
    if (url == nil) {
        fprintf(stderr, "%s: malformed URL: %s\n", getprogname(), urlCStr);
        return nil;
    }

    if ( ! [self isSupportedURL:url] ) {
        NSString *      scheme;
        const char *    schemeCStr;

        // A string such as "example.com" yields a URL with no scheme at all.
        // -UTF8String on nil gives NULL, and NULL must not be passed to %s,
        // so substitute a readable placeholder in that case.

        scheme = [[url scheme] lowercaseString];
        schemeCStr = (scheme != nil) ? [scheme UTF8String] : NULL;
        if (schemeCStr == NULL) {
            schemeCStr = "(none)";
        }
        fprintf(stderr, "%s: unsupported URL scheme: %s\n", getprogname(), schemeCStr);
        return nil;
    }

    // Then check maximumDepth.  If that passes, create the object to return.

    if (maximumDepth < 0) {
        fprintf(stderr, "%s: maximum depth must be non-negative\n", getprogname());
        return nil;
    }

    result = [[[self alloc] initWithURL:url] autorelease];
    if (result != nil) {
        result.maximumDepth = (NSUInteger) maximumDepth;    // known to be >= 0 here
    }
    return result;
}
|
@end |