在开发中,大部分会使用 JSON 进行数据解析,偶尔会用到 HTML。
使用 Objective-C 解析 HTML 或者 XML,系统自带有两种方式:一个是通过 libxml,一个是通过 NSXMLParser。
libxml 性能较好,且可以结合 NSURLConnection 实现边下载边解析,在要求快速、分批响应 UI 的情况下较为有用;NSXMLParser 基本没什么优势,不如使用第三方工具。
Hpple 是一个轻量级的包装框架,可以很好地解决这个问题。尤其是它支持 HTML 的解析,这是其他 XML 类库所不及的地方;它使用 XPath 来定位和解析 HTML 或者 XML。
使用方法:
1> 在 GitHub 上下载 Hpple;
2> 把 Hpple 文件夹拖进项目工程;
3> 添加依赖库 libxml2.2.dylib;
4> 配置 libxml2 头文件搜索路径(在 Build Settings 的 Header Search Paths 中添加 ${SDKROOT}/usr/include/libxml2,否则编译报错),如图:
解析本地 HTML 文件:
// Load index.html from the main bundle and parse it as HTML.
NSString *path = [[NSBundle mainBundle] pathForResource:@"index" ofType:@"html"];
NSData *data = [NSData dataWithContentsOfFile:path];
TFHpple *doc = [[TFHpple alloc] initWithHTMLData:data];

// Collect every <a> element whose class attribute is exactly "sponsor".
NSArray *elements = [doc searchWithXPathQuery:@"//a[@class='sponsor']"];

// -firstObject yields nil when the query matched nothing, whereas
// -objectAtIndex:0 raises NSRangeException on an empty array.
TFHppleElement *element = [elements firstObject];

// The calls below only demonstrate the accessors; their results are discarded.
[element text];                       // The text inside the HTML element (the content of the first text node)
[element tagName];                    // "a"
[element attributes];                 // NSDictionary of href, class, id, etc.
[element objectForKey:@"href"];       // Easy access to single attribute
[element firstChildWithTagName:@"b"]; // The first "b" child node

NSLog(@"%@ txt = %@ tagName = %@ attributes = %@",
      elements, [element text], [element tagName], [element attributes]);
解析网络HTML:
// NOTE: -initWithContentsOfURL: fetches the page synchronously and blocks the
// calling thread until the download finishes; run this off the main thread in
// a real application.
NSURL *url = [NSURL URLWithString:@"http://world.chinadaily.com.cn/2017-08/02/content_30329852.htm"];
NSData *htmlData = [[NSData alloc] initWithContentsOfURL:url];
TFHpple *xpathParser = [[TFHpple alloc] initWithHTMLData:htmlData];

// The attribute value is quoted with single quotes: bare double quotes inside
// an @"..." literal terminate the string and break compilation.
NSArray *divArray = [xpathParser searchWithXPathQuery:@"//div[@class='article']"];

NSMutableString *str = [[NSMutableString alloc] init];

// -firstObject is nil (instead of throwing) when no matching <div> was found.
TFHppleElement *element = [divArray firstObject];

// -appendString: raises NSInvalidArgumentException for nil, so only append
// when the element actually produced content.
NSString *content = [element content];
if (content != nil) {
    [str appendString:content];
}
NSLog(@"%@", str);
HTML 解析测试:
/// Parses the bundled HTML fixture into self.doc before each test runs.
- (void)setUp {
    [super setUp];
    NSBundle *testBundle = [NSBundle bundleForClass:[self class]];
    NSURL *testFileUrl = [testBundle URLForResource:TEST_DOCUMENT_NAME
                                      withExtension:TEST_DOCUMENT_EXTENSION];
    NSData *data = [NSData dataWithContentsOfURL:testFileUrl];
    self.doc = [[TFHpple alloc] initWithHTMLData:data];
}

/// The parser keeps the data it was created with and is exactly a TFHpple.
- (void)testInitializesWithHTMLData {
    XCTAssertNotNil(self.doc.data);
    XCTAssertTrue([self.doc isMemberOfClass:[TFHpple class]]);
}

//  doc.search("//p[@class='posted']")
/// An XPath search returns the two sponsor links as TFHppleElement instances.
- (void)testSearchesWithXPath {
    NSArray *a = [self.doc searchWithXPathQuery:@"//a[@class='sponsor']"];
    XCTAssertEqual([a count], 2);

    // firstObject keeps the test from throwing if the count assertion above
    // already failed on an empty result.
    TFHppleElement *e = [a firstObject];
    XCTAssertTrue([e isMemberOfClass:[TFHppleElement class]]);
}

/// peekAtSearchWithXPathQuery: is expected to yield the "RailsMachine" <a> element.
- (void)testFindsFirstElementAtXPath {
    TFHppleElement *e = [self.doc peekAtSearchWithXPathQuery:@"//a[@class='sponsor']"];

    XCTAssertEqualObjects([e content], @"RailsMachine");
    XCTAssertEqualObjects([e tagName], @"a");
}

/// A nested XPath finds five <strong> elements; the first one reads "PeepCode".
- (void)testSearchesByNestedXPath {
    NSArray *a = [self.doc searchWithXPathQuery:@"//div[@class='column']//strong"];
    XCTAssertEqual([a count], 5);

    TFHppleElement *e = [a firstObject];
    XCTAssertEqualObjects([e content], @"PeepCode");
}

/// Element attributes are exposed as an NSDictionary keyed by attribute name.
- (void)testPopulatesAttributes {
    TFHppleElement *e = [self.doc peekAtSearchWithXPathQuery:@"//a[@class='sponsor']"];

    XCTAssertTrue([[e attributes] isKindOfClass:[NSDictionary class]]);
    XCTAssertEqualObjects([[e attributes] objectForKey:@"href"], @"http://railsmachine.com/");
}

/// objectForKey: on the element reads a single attribute directly.
- (void)testProvidesEasyAccessToAttributes {
    TFHppleElement *e = [self.doc peekAtSearchWithXPathQuery:@"//a[@class='sponsor']"];

    XCTAssertEqualObjects([e objectForKey:@"href"], @"http://railsmachine.com/");
}
XML 解析测试:
/// Parses the bundled XML fixture into self.doc before each test runs.
- (void)setUp {
    [super setUp];
    NSBundle *testBundle = [NSBundle bundleForClass:[self class]];
    NSURL *testFileUrl = [testBundle URLForResource:TEST_DOCUMENT_NAME
                                      withExtension:TEST_DOCUMENT_EXTENSION];
    NSData *data = [NSData dataWithContentsOfURL:testFileUrl];
    self.doc = [[TFHpple alloc] initWithXMLData:data];
}

/// The parser keeps the data it was created with and is exactly a TFHpple.
- (void)testInitializesWithXMLData {
    XCTAssertNotNil(self.doc.data);
    XCTAssertTrue([self.doc isMemberOfClass:[TFHpple class]]);
}

//  item/title,description,link
/// Searching for //item returns fifteen TFHppleElement instances.
- (void)testSearchesWithXPath {
    NSArray *items = [self.doc searchWithXPathQuery:@"//item"];
    XCTAssertEqual([items count], 15);

    // firstObject keeps the test from throwing if the count assertion above
    // already failed on an empty result.
    TFHppleElement *e = [items firstObject];
    XCTAssertTrue([e isMemberOfClass:[TFHppleElement class]]);
}

/// peekAtSearchWithXPathQuery: is expected to yield the first item's <title>.
- (void)testFindsFirstElementAtXPath {
    TFHppleElement *e = [self.doc peekAtSearchWithXPathQuery:@"//item/title"];

    XCTAssertEqualObjects([e content], @"Objective-C for Rubyists");
    XCTAssertEqualObjects([e tagName], @"title");
}

/// A nested XPath finds fifteen titles; the first reads "Objective-C for Rubyists".
- (void)testSearchesByNestedXPath {
    NSArray *elements = [self.doc searchWithXPathQuery:@"//item/title"];
    XCTAssertEqual([elements count], 15);

    TFHppleElement *e = [elements firstObject];
    XCTAssertEqualObjects([e content], @"Objective-C for Rubyists");
}

/// A query that is expected to match nothing in the XML fixture yields nil.
- (void)testAtSafelyReturnsNilIfEmpty {
    TFHppleElement *e = [self.doc peekAtSearchWithXPathQuery:@"//a[@class='sponsor']"];

    XCTAssertNil(e);
}

// Other Hpricot methods:
//  doc.at("body")['onload']
//  (doc/"#elementID").inner_html
//  (doc/"#elementID").to_html
//  doc.at("div > div:nth(1)").css_path
//  doc.at("div > div:nth(1)").xpath
字符串解析测试:
/// Parses a short in-memory HTML fragment into self.doc before each test runs.
- (void)setUp {
    [super setUp];

    // The double quotes around the href value are escaped; left bare they
    // would terminate the @"..." literal and the file would not compile.
    NSString *htmlString = @"String with a link <a href=\"http://www.google.com\">This is a link</a> and the end";
    NSData *data = [htmlString dataUsingEncoding:NSUTF8StringEncoding];
    self.doc = [[TFHpple alloc] initWithHTMLData:data];
}

/// The fragment is expected to yield three text nodes: before, inside and after the link.
- (void)testTextNodeCount {
    NSArray *textNodes = [self.doc searchWithXPathQuery:@"//text()"];
    XCTAssertEqual(textNodes.count, 3);
}

/// The first text node is the text preceding the link, trailing space included.
- (void)testFirstTextNodeContent {
    TFHppleElement *e = [self.doc peekAtSearchWithXPathQuery:@"//text()"];
    XCTAssertEqualObjects([e content], @"String with a link ");
}

/// The content of the <a> element is its link text.
- (void)testALinkContent {
    TFHppleElement *e = [self.doc peekAtSearchWithXPathQuery:@"//a"];
    XCTAssertEqualObjects([e content], @"This is a link");
}

/// The href attribute of the <a> element is read back unchanged.
- (void)testHref {
    TFHppleElement *e = [self.doc peekAtSearchWithXPathQuery:@"//a"];
    XCTAssertEqualObjects([e objectForKey:@"href"], @"http://www.google.com");
}