{"id":1558,"date":"2023-03-25T11:26:10","date_gmt":"2023-03-25T03:26:10","guid":{"rendered":""},"modified":"2023-03-25T11:26:10","modified_gmt":"2023-03-25T03:26:10","slug":"Python\u722c\u866blxml","status":"publish","type":"post","link":"https:\/\/bianchenghao6.com\/1558.html","title":{"rendered":"Python\u722c\u866blxml"},"content":{"rendered":"


\n <\/head>
\n <\/p>\n

\n

Python\u722c\u866blxml<\/h1>\n

Python\u722c\u866blxml\u8be6\u7ec6\u6559\u7a0b<\/span>\n <\/div>\n

\u5b89\u88c5<\/h2>\n
\n
pip install lxml<\/code><\/pre>\n<\/p><\/div>\n

\u5229\u7528 pip \u5b89\u88c5\u5373\u53ef<\/p>\n

XPath\u8bed\u6cd5<\/h2>\n

XPath \u662f\u4e00\u95e8\u5728 XML \u6587\u6863\u4e2d\u67e5\u627e\u4fe1\u606f\u7684\u8bed\u8a00\u3002XPath \u53ef\u7528\u6765\u5728 XML \u6587\u6863\u4e2d\u5bf9\u5143\u7d20\u548c\u5c5e\u6027\u8fdb\u884c\u904d\u5386\u3002XPath \u662f W3C XSLT \u6807\u51c6\u7684\u4e3b\u8981\u5143\u7d20\uff0c\u5e76\u4e14 XQuery \u548c XPointer \u90fd\u6784\u5efa\u4e8e XPath \u8868\u8fbe\u4e4b\u4e0a\u3002<\/p>\n

\u8282\u70b9\u5173\u7cfb<\/h3>\n

\uff081\uff09\u7236\uff08Parent\uff09<\/p>\n

\u6bcf\u4e2a\u5143\u7d20\u4ee5\u53ca\u5c5e\u6027\u90fd\u6709\u4e00\u4e2a\u7236\u3002<\/p>\n

\u5728\u4e0b\u9762\u7684\u4f8b\u5b50\u4e2d\uff0cbook \u5143\u7d20\u662f title\u3001author\u3001year \u4ee5\u53ca price \u5143\u7d20\u7684\u7236\uff1a<\/p>\n

\n
<book> \n
  <title>Harry Potter<\/title> \n
  <author>J K. Rowling<\/author> \n
  <year>2005<\/year> \n
  <price>29.99<\/price> \n
<\/book><\/code><\/pre>\n<\/p><\/div>\n

\uff082\uff09\u5b50\uff08Children\uff09<\/p>\n

\u5143\u7d20\u8282\u70b9\u53ef\u6709\u96f6\u4e2a\u3001\u4e00\u4e2a\u6216\u591a\u4e2a\u5b50\u3002<\/p>\n

\u5728\u4e0b\u9762\u7684\u4f8b\u5b50\u4e2d\uff0ctitle\u3001author\u3001year \u4ee5\u53ca price \u5143\u7d20\u90fd\u662f book \u5143\u7d20\u7684\u5b50\uff1a<\/p>\n

\n
<book> \n
  <title>Harry Potter<\/title> \n
  <author>J K. Rowling<\/author> \n
  <year>2005<\/year> \n
  <price>29.99<\/price> \n
<\/book><\/code><\/pre>\n<\/p><\/div>\n

\uff083\uff09\u540c\u80de\uff08Sibling\uff09<\/p>\n

\u62e5\u6709\u76f8\u540c\u7684\u7236\u7684\u8282\u70b9<\/p>\n

\u5728\u4e0b\u9762\u7684\u4f8b\u5b50\u4e2d\uff0ctitle\u3001author\u3001year \u4ee5\u53ca price \u5143\u7d20\u90fd\u662f\u540c\u80de\uff1a<\/p>\n

\n
<book> \n
  <title>Harry Potter<\/title> \n
  <author>J K. Rowling<\/author> \n
  <year>2005<\/year> \n
  <price>29.99<\/price> \n
<\/book><\/code><\/pre>\n<\/p><\/div>\n

\uff084\uff09\u5148\u8f88\uff08Ancestor\uff09<\/p>\n

\u67d0\u8282\u70b9\u7684\u7236\u3001\u7236\u7684\u7236\uff0c\u7b49\u7b49\u3002<\/p>\n

\u5728\u4e0b\u9762\u7684\u4f8b\u5b50\u4e2d\uff0ctitle \u5143\u7d20\u7684\u5148\u8f88\u662f book \u5143\u7d20\u548c bookstore \u5143\u7d20\uff1a<\/p>\n

\n
<bookstore> \n
  <book> \n
    <title>Harry Potter<\/title> \n
    <author>J K. Rowling<\/author> \n
    <year>2005<\/year> \n
    <price>29.99<\/price> \n
  <\/book> \n
<\/bookstore><\/code><\/pre>\n<\/p><\/div>\n

\uff085\uff09\u540e\u4ee3\uff08Descendant\uff09<\/p>\n

\u67d0\u4e2a\u8282\u70b9\u7684\u5b50\uff0c\u5b50\u7684\u5b50\uff0c\u7b49\u7b49\u3002<\/p>\n

\u5728\u4e0b\u9762\u7684\u4f8b\u5b50\u4e2d\uff0cbookstore \u7684\u540e\u4ee3\u662f book\u3001title\u3001author\u3001year \u4ee5\u53ca price \u5143\u7d20\uff1a<\/p>\n

\n
 
\r\n<bookstore>\r\n<book>\r\n  <title>Harry Potter<\/title>\r\n  <author>J K. Rowling<\/author>\r\n  <year>2005<\/year>\r\n  <price>29.99<\/price>\r\n<\/book>\r\n<\/bookstore><\/pre>\n

<\/span><\/code>\n <\/div>\n

\u9009\u53d6\u8282\u70b9<\/h3>\n

XPath \u4f7f\u7528\u8def\u5f84\u8868\u8fbe\u5f0f\u5728 XML \u6587\u6863\u4e2d\u9009\u53d6\u8282\u70b9\u3002\u8282\u70b9\u662f\u901a\u8fc7\u6cbf\u7740\u8def\u5f84\u6216\u8005 step \u6765\u9009\u53d6\u7684\u3002<\/p>\n

\u4e0b\u9762\u5217\u51fa\u4e86\u6700\u6709\u7528\u7684\u8def\u5f84\u8868\u8fbe\u5f0f\uff1a<\/h3>\n\n\n\n\n\n\n\n\n\n
\u8868\u8fbe\u5f0f<\/td>\n\u63cf\u8ff0<\/td>\n<\/tr>\n
nodename<\/td>\n\u9009\u53d6\u6b64\u8282\u70b9\u7684\u6240\u6709\u5b50\u8282\u70b9\u3002<\/td>\n<\/tr>\n
\/<\/td>\n\u4ece\u6839\u8282\u70b9\u9009\u53d6\u3002<\/td>\n<\/tr>\n
\/\/<\/td>\n\u4ece\u5339\u914d\u9009\u62e9\u7684\u5f53\u524d\u8282\u70b9\u9009\u62e9\u6587\u6863\u4e2d\u7684\u8282\u70b9\uff0c\u800c\u4e0d\u8003\u8651\u5b83\u4eec\u7684\u4f4d\u7f6e\u3002<\/td>\n<\/tr>\n
.<\/td>\n\u9009\u53d6\u5f53\u524d\u8282\u70b9\u3002<\/td>\n<\/tr>\n
..<\/td>\n\u9009\u53d6\u5f53\u524d\u8282\u70b9\u7684\u7236\u8282\u70b9\u3002<\/td>\n<\/tr>\n
@<\/td>\n\u9009\u53d6\u5c5e\u6027\u3002<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n

\u5728\u4e0b\u9762\u7684\u8868\u683c\u4e2d\uff0c\u6211\u4eec\u5df2\u5217\u51fa\u4e86\u4e00\u4e9b\u8def\u5f84\u8868\u8fbe\u5f0f\u4ee5\u53ca\u8868\u8fbe\u5f0f\u7684\u7ed3\u679c\uff1a<\/p>\n\n\n\n\n\n\n\n\n\n
\u8def\u5f84\u8868\u8fbe\u5f0f<\/td>\n\u7ed3\u679c<\/td>\n<\/tr>\n
bookstore<\/td>\n\u9009\u53d6 bookstore \u5143\u7d20\u7684\u6240\u6709\u5b50\u8282\u70b9\u3002<\/td>\n<\/tr>\n
\/bookstore<\/td>\n\u9009\u53d6\u6839\u5143\u7d20 bookstore\u3002\u6ce8\u91ca\uff1a\u5047\u5982\u8def\u5f84\u8d77\u59cb\u4e8e\u6b63\u659c\u6760( \/ )\uff0c\u5219\u6b64\u8def\u5f84\u59cb\u7ec8\u4ee3\u8868\u5230\u67d0\u5143\u7d20\u7684\u7edd\u5bf9\u8def\u5f84\uff01<\/td>\n<\/tr>\n
bookstore\/book<\/td>\n\u9009\u53d6\u5c5e\u4e8e bookstore \u7684\u5b50\u5143\u7d20\u7684\u6240\u6709 book \u5143\u7d20\u3002<\/td>\n<\/tr>\n
\/\/book<\/td>\n\u9009\u53d6\u6240\u6709 book \u5b50\u5143\u7d20\uff0c\u800c\u4e0d\u7ba1\u5b83\u4eec\u5728\u6587\u6863\u4e2d\u7684\u4f4d\u7f6e\u3002<\/td>\n<\/tr>\n
bookstore\/\/book<\/td>\n\u9009\u62e9\u5c5e\u4e8e bookstore \u5143\u7d20\u7684\u540e\u4ee3\u7684\u6240\u6709 book \u5143\u7d20\uff0c\u800c\u4e0d\u7ba1\u5b83\u4eec\u4f4d\u4e8e bookstore \u4e4b\u4e0b\u7684\u4ec0\u4e48\u4f4d\u7f6e\u3002<\/td>\n<\/tr>\n
\/\/@lang<\/td>\n\u9009\u53d6\u540d\u4e3a lang \u7684\u6240\u6709\u5c5e\u6027\u3002<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n

\u8c13\u8bed\uff08Predicates\uff09<\/h3>\n

\u8c13\u8bed\u7528\u6765\u67e5\u627e\u67d0\u4e2a\u7279\u5b9a\u7684\u8282\u70b9\u6216\u8005\u5305\u542b\u67d0\u4e2a\u6307\u5b9a\u7684\u503c\u7684\u8282\u70b9\u3002<\/p>\n

\u8c13\u8bed\u88ab\u5d4c\u5728\u65b9\u62ec\u53f7\u4e2d\u3002<\/p>\n

\u5728\u4e0b\u9762\u7684\u8868\u683c\u4e2d\uff0c\u6211\u4eec\u5217\u51fa\u4e86\u5e26\u6709\u8c13\u8bed\u7684\u4e00\u4e9b\u8def\u5f84\u8868\u8fbe\u5f0f\uff0c\u4ee5\u53ca\u8868\u8fbe\u5f0f\u7684\u7ed3\u679c\uff1a<\/p>\n\n\n\n\n\n\n\n\n\n\n\n
\u8def\u5f84\u8868\u8fbe\u5f0f<\/td>\n\u7ed3\u679c<\/td>\n<\/tr>\n
\/bookstore\/book[1]<\/td>\n\u9009\u53d6\u5c5e\u4e8e bookstore \u5b50\u5143\u7d20\u7684\u7b2c\u4e00\u4e2a book \u5143\u7d20\u3002<\/td>\n<\/tr>\n
\/bookstore\/book[last()]<\/td>\n\u9009\u53d6\u5c5e\u4e8e bookstore \u5b50\u5143\u7d20\u7684\u6700\u540e\u4e00\u4e2a book \u5143\u7d20\u3002<\/td>\n<\/tr>\n
\/bookstore\/book[last()-1]<\/td>\n\u9009\u53d6\u5c5e\u4e8e bookstore \u5b50\u5143\u7d20\u7684\u5012\u6570\u7b2c\u4e8c\u4e2a book \u5143\u7d20\u3002<\/td>\n<\/tr>\n
\/bookstore\/book[position()<3]<\/td>\n\u9009\u53d6\u6700\u524d\u9762\u7684\u4e24\u4e2a\u5c5e\u4e8e bookstore \u5143\u7d20\u7684\u5b50\u5143\u7d20\u7684 book \u5143\u7d20\u3002<\/td>\n<\/tr>\n
\/\/title[@lang]<\/td>\n\u9009\u53d6\u6240\u6709\u62e5\u6709\u540d\u4e3a lang \u7684\u5c5e\u6027\u7684 title \u5143\u7d20\u3002<\/td>\n<\/tr>\n
\/\/title[@lang='eng']<\/td>\n\u9009\u53d6\u6240\u6709 title \u5143\u7d20\uff0c\u4e14\u8fd9\u4e9b\u5143\u7d20\u62e5\u6709\u503c\u4e3a eng \u7684 lang \u5c5e\u6027\u3002<\/td>\n<\/tr>\n
\/bookstore\/book[price>35.00]<\/td>\n\u9009\u53d6 bookstore \u5143\u7d20\u7684\u6240\u6709 book \u5143\u7d20\uff0c\u4e14\u5176\u4e2d\u7684 price \u5143\u7d20\u7684\u503c\u987b\u5927\u4e8e 35.00\u3002<\/td>\n<\/tr>\n
\/bookstore\/book[price>35.00]\/title<\/td>\n\u9009\u53d6 bookstore \u5143\u7d20\u4e2d\u7684 book \u5143\u7d20\u7684\u6240\u6709 title \u5143\u7d20\uff0c\u4e14\u5176\u4e2d\u7684 price \u5143\u7d20\u7684\u503c\u987b\u5927\u4e8e 35.00\u3002<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n

\u9009\u53d6\u672a\u77e5\u8282\u70b9<\/h3>\n

XPath \u901a\u914d\u7b26\u53ef\u7528\u6765\u9009\u53d6\u672a\u77e5\u7684 XML \u5143\u7d20\u3002<\/p>\n\n\n\n\n\n\n
\u901a\u914d\u7b26<\/td>\n\u63cf\u8ff0<\/td>\n<\/tr>\n
*<\/td>\n\u5339\u914d\u4efb\u4f55\u5143\u7d20\u8282\u70b9\u3002<\/td>\n<\/tr>\n
@*<\/td>\n\u5339\u914d\u4efb\u4f55\u5c5e\u6027\u8282\u70b9\u3002<\/td>\n<\/tr>\n
node()<\/td>\n\u5339\u914d\u4efb\u4f55\u7c7b\u578b\u7684\u8282\u70b9\u3002<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n

\u5728\u4e0b\u9762\u7684\u8868\u683c\u4e2d\uff0c\u6211\u4eec\u5217\u51fa\u4e86\u4e00\u4e9b\u8def\u5f84\u8868\u8fbe\u5f0f\uff0c\u4ee5\u53ca\u8fd9\u4e9b\u8868\u8fbe\u5f0f\u7684\u7ed3\u679c\uff1a<\/p>\n\n\n\n\n\n\n
\u8def\u5f84\u8868\u8fbe\u5f0f<\/td>\n\u7ed3\u679c<\/td>\n<\/tr>\n
\/bookstore\/*<\/td>\n\u9009\u53d6 bookstore \u5143\u7d20\u7684\u6240\u6709\u5b50\u5143\u7d20\u3002<\/td>\n<\/tr>\n
\/\/*<\/td>\n\u9009\u53d6\u6587\u6863\u4e2d\u7684\u6240\u6709\u5143\u7d20\u3002<\/td>\n<\/tr>\n
\/\/title[@*]<\/td>\n\u9009\u53d6\u6240\u6709\u5e26\u6709\u5c5e\u6027\u7684 title \u5143\u7d20\u3002<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n

\u9009\u53d6\u82e5\u5e72\u8def\u5f84<\/h3>\n

\u901a\u8fc7\u5728\u8def\u5f84\u8868\u8fbe\u5f0f\u4e2d\u4f7f\u7528\u201c|\u201d\u8fd0\u7b97\u7b26\uff0c\u60a8\u53ef\u4ee5\u9009\u53d6\u82e5\u5e72\u4e2a\u8def\u5f84\u3002
\u5728\u4e0b\u9762\u7684\u8868\u683c\u4e2d\uff0c\u6211\u4eec\u5217\u51fa\u4e86\u4e00\u4e9b\u8def\u5f84\u8868\u8fbe\u5f0f\uff0c\u4ee5\u53ca\u8fd9\u4e9b\u8868\u8fbe\u5f0f\u7684\u7ed3\u679c\uff1a<\/p>\n\n\n\n\n\n\n
\u8def\u5f84\u8868\u8fbe\u5f0f<\/td>\n\u7ed3\u679c<\/td>\n<\/tr>\n
\/\/book\/title | \/\/book\/price<\/td>\n\u9009\u53d6 book \u5143\u7d20\u7684\u6240\u6709 title \u548c price \u5143\u7d20\u3002<\/td>\n<\/tr>\n
\/\/title | \/\/price<\/td>\n\u9009\u53d6\u6587\u6863\u4e2d\u7684\u6240\u6709 title \u548c price \u5143\u7d20\u3002<\/td>\n<\/tr>\n
\/bookstore\/book\/title | \/\/price<\/td>\n\u9009\u53d6\u5c5e\u4e8e bookstore \u5143\u7d20\u7684 book \u5143\u7d20\u7684\u6240\u6709 title \u5143\u7d20\uff0c\u4ee5\u53ca\u6587\u6863\u4e2d\u6240\u6709\u7684 price \u5143\u7d20\u3002<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n

XPath \u8fd0\u7b97\u7b26<\/h3>\n

\u4e0b\u9762\u5217\u51fa\u4e86\u53ef\u7528\u5728 XPath \u8868\u8fbe\u5f0f\u4e2d\u7684\u8fd0\u7b97\u7b26\uff1a<\/p>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
\u8fd0\u7b97\u7b26<\/td>\n\u63cf\u8ff0<\/td>\n\u5b9e\u4f8b<\/td>\n\u8fd4\u56de\u503c<\/td>\n<\/tr>\n
|<\/td>\n\u8ba1\u7b97\u4e24\u4e2a\u8282\u70b9\u96c6<\/td>\n\/\/book | \/\/cd<\/td>\n\u8fd4\u56de\u6240\u6709\u62e5\u6709 book \u548c cd \u5143\u7d20\u7684\u8282\u70b9\u96c6<\/td>\n<\/tr>\n
+<\/td>\n\u52a0\u6cd5<\/td>\n6 + 4<\/td>\n10<\/td>\n<\/tr>\n
-<\/td>\n\u51cf\u6cd5<\/td>\n6 - 4<\/td>\n2<\/td>\n<\/tr>\n
*<\/td>\n\u4e58\u6cd5<\/td>\n6 * 4<\/td>\n24<\/td>\n<\/tr>\n
div<\/td>\n\u9664\u6cd5<\/td>\n8 div 4<\/td>\n2<\/td>\n<\/tr>\n
=<\/td>\n\u7b49\u4e8e<\/td>\nprice=9.80<\/td>\n\u5982\u679c price \u662f 9.80\uff0c\u5219\u8fd4\u56de true\u3002\u5982\u679c price \u662f 9.90\uff0c\u5219\u8fd4\u56de false\u3002<\/td>\n<\/tr>\n
!=<\/td>\n\u4e0d\u7b49\u4e8e<\/td>\nprice!=9.80<\/td>\n\u5982\u679c price \u662f 9.90\uff0c\u5219\u8fd4\u56de true\u3002\u5982\u679c price \u662f 9.80\uff0c\u5219\u8fd4\u56de false\u3002<\/td>\n<\/tr>\n
<<\/td>\n\u5c0f\u4e8e<\/td>\nprice<9.80<\/td>\n\u5982\u679c price \u662f 9.00\uff0c\u5219\u8fd4\u56de true\u3002\u5982\u679c price \u662f 9.90\uff0c\u5219\u8fd4\u56de false\u3002<\/td>\n<\/tr>\n
<=<\/td>\n\u5c0f\u4e8e\u6216\u7b49\u4e8e<\/td>\nprice<=9.80<\/td>\n\u5982\u679c price \u662f 9.00\uff0c\u5219\u8fd4\u56de true\u3002\u5982\u679c price \u662f 9.90\uff0c\u5219\u8fd4\u56de false\u3002<\/td>\n<\/tr>\n
><\/td>\n\u5927\u4e8e<\/td>\nprice>9.80<\/td>\n\u5982\u679c price \u662f 9.90\uff0c\u5219\u8fd4\u56de true\u3002\u5982\u679c price \u662f 9.80\uff0c\u5219\u8fd4\u56de false\u3002<\/td>\n<\/tr>\n
>=<\/td>\n\u5927\u4e8e\u6216\u7b49\u4e8e<\/td>\nprice>=9.80<\/td>\n\u5982\u679c price \u662f 9.90\uff0c\u5219\u8fd4\u56de true\u3002\u5982\u679c price \u662f 9.70\uff0c\u5219\u8fd4\u56de false\u3002<\/td>\n<\/tr>\n
or<\/td>\n\u6216<\/td>\nprice=9.80 or price=9.70<\/td>\n\u5982\u679c price \u662f 9.80\uff0c\u5219\u8fd4\u56de true\u3002\u5982\u679c price \u662f 9.50\uff0c\u5219\u8fd4\u56de false\u3002<\/td>\n<\/tr>\n
and<\/td>\n\u4e0e<\/td>\nprice>9.00 and price<9.90<\/td>\n\u5982\u679c price \u662f 9.80\uff0c\u5219\u8fd4\u56de true\u3002\u5982\u679c price \u662f 8.50\uff0c\u5219\u8fd4\u56de false\u3002<\/td>\n<\/tr>\n
mod<\/td>\n\u8ba1\u7b97\u9664\u6cd5\u7684\u4f59\u6570<\/td>\n5 mod 2<\/td>\n1<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n

lxml\u7528\u6cd5<\/h2>\n

\u521d\u6b65\u4f7f\u7528<\/h3>\n

\u9996\u5148\u6211\u4eec\u5229\u7528\u5b83\u6765\u89e3\u6790 HTML \u4ee3\u7801\uff0c\u5148\u6765\u4e00\u4e2a\u5c0f\u4f8b\u5b50\u6765\u611f\u53d7\u4e00\u4e0b\u5b83\u7684\u57fa\u672c\u7528\u6cd5\u3002<\/p>\n

\n
from lxml import etree \n
text = ''' \n
<div> \n
<ul> \n
<li class=\"item-0\"><a href=\"link1.html\">first item<\/a><\/li> \n
<li class=\"item-1\"><a href=\"link2.html\">second item<\/a><\/li> \n
<li class=\"item-inactive\"><a href=\"link3.html\">third item<\/a><\/li> \n
<li class=\"item-1\"><a href=\"link4.html\">fourth item<\/a><\/li> \n
<li class=\"item-0\"><a href=\"link5.html\">fifth item<\/a> \n
<\/ul> \n
<\/div> \n
''' \n
html = etree.HTML(text) \n
result = etree.tostring(html) \n
print(result) \n
<\/code><\/pre>\n<\/p><\/div>\n

\u9996\u5148\u6211\u4eec\u4f7f\u7528 lxml \u7684 etree \u5e93\uff0c\u7136\u540e\u5229\u7528 etree.HTML \u521d\u59cb\u5316\uff0c\u7136\u540e\u6211\u4eec\u5c06\u5176\u6253\u5370\u51fa\u6765\u3002<\/p>\n

\u5176\u4e2d\uff0c\u8fd9\u91cc\u4f53\u73b0\u4e86 lxml \u7684\u4e00\u4e2a\u975e\u5e38\u5b9e\u7528\u7684\u529f\u80fd\u5c31\u662f\u81ea\u52a8\u4fee\u6b63 html \u4ee3\u7801\uff0c\u5927\u5bb6\u5e94\u8be5\u6ce8\u610f\u5230\u4e86\uff0c\u6700\u540e\u4e00\u4e2a li \u6807\u7b7e\uff0c\u5176\u5b9e\u6211\u628a\u5c3e\u6807\u7b7e\u5220\u6389\u4e86\uff0c\u662f\u4e0d\u95ed\u5408\u7684\u3002\u4e0d\u8fc7\uff0clxml \u56e0\u4e3a\u7ee7\u627f\u4e86 libxml2 \u7684\u7279\u6027\uff0c\u5177\u6709\u81ea\u52a8\u4fee\u6b63 HTML \u4ee3\u7801\u7684\u529f\u80fd\u3002<\/p>\n

\u6240\u4ee5\u8f93\u51fa\u7ed3\u679c\u662f\u8fd9\u6837\u7684<\/p>\n

\n
<html><body> \n
<div> \n
<ul> \n
<li class=\"item-0\"><a href=\"link1.html\">first item<\/a><\/li> \n
<li class=\"item-1\"><a href=\"link2.html\">second item<\/a><\/li> \n
<li class=\"item-inactive\"><a href=\"link3.html\">third item<\/a><\/li> \n
<li class=\"item-1\"><a href=\"link4.html\">fourth item<\/a><\/li> \n
<li class=\"item-0\"><a href=\"link5.html\">fifth item<\/a><\/li> \n
<\/ul> \n
<\/div> \n
<\/body><\/html><\/code><\/pre>\n<\/p><\/div>\n

\u4e0d\u4ec5\u8865\u5168\u4e86 li \u6807\u7b7e\uff0c\u8fd8\u6dfb\u52a0\u4e86 body\uff0chtml \u6807\u7b7e\u3002<\/p>\n

\u6587\u4ef6\u8bfb\u53d6<\/h3>\n

\u9664\u4e86\u76f4\u63a5\u8bfb\u53d6\u5b57\u7b26\u4e32\uff0c\u8fd8\u652f\u6301\u4ece\u6587\u4ef6\u8bfb\u53d6\u5185\u5bb9\u3002\u6bd4\u5982\u6211\u4eec\u65b0\u5efa\u4e00\u4e2a\u6587\u4ef6\u53eb\u505a hello.html\uff0c\u5185\u5bb9\u4e3a<\/p>\n

\n
<div> \n
<ul> \n
<li class=\"item-0\"><a href=\"link1.html\">first item<\/a><\/li> \n
<li class=\"item-1\"><a href=\"link2.html\">second item<\/a><\/li> \n
<li class=\"item-inactive\"><a href=\"link3.html\"><span class=\"bold\">third item<\/span><\/a><\/li> \n
<li class=\"item-1\"><a href=\"link4.html\">fourth item<\/a><\/li> \n
<li class=\"item-0\"><a href=\"link5.html\">fifth item<\/a><\/li> \n
<\/ul> \n
<\/div><\/code><\/pre>\n<\/p><\/div>\n

\u5229\u7528 parse \u65b9\u6cd5\u6765\u8bfb\u53d6\u6587\u4ef6\uff0c\u4f8b\u5982 html = etree.parse('hello.html')\u3002<\/p>\n

\u540c\u6837\u53ef\u4ee5\u5f97\u5230\u76f8\u540c\u7684\u7ed3\u679c\u3002<\/p>\n

XPath\u5b9e\u4f8b\u6d4b\u8bd5<\/h3>\n

\u4f9d\u7136\u4ee5\u4e0a\u4e00\u6bb5\u7a0b\u5e8f\u4e3a\u4f8b<\/p>\n

\uff081\uff09\u83b7\u53d6\u6240\u6709\u7684 <li> \u6807\u7b7e<\/p>\n

\n
from lxml import etree \n
html = etree.parse('hello.html') \n
print type(html) \n
result = html.xpath('\/\/li') \n
print result \n
print len(result) \n
print type(result) \n
print type(result[0])<\/code><\/pre>\n<\/p><\/div>\n

\u8fd0\u884c\u7ed3\u679c<\/p>\n

\n
 # Filename : example.py<\/span>
# Copyright : 2020 By Lidihuo<\/span>
# Author by : www.lidihuo.com<\/span>
# Date : 2020-08-21<\/span>
<type 'lxml.etree._ElementTree'<\/span>>
[<Element li at 0x1014e0e18>, <Element li at 0x1014e0ef0>, <Element li at 0x1014e0f38>, <Element li at 0x1014e0f80>, <Element li at 0x1014e0fc8>]
5
<type 'list'<\/span>>
<type 'lxml.etree._Element'<\/span>> <\/span><\/code><\/pre>\n<\/p><\/div>\n

\u53ef\u89c1\uff0cetree.parse \u7684\u7c7b\u578b\u662f ElementTree\uff0c\u901a\u8fc7\u8c03\u7528 xpath \u4ee5\u540e\uff0c\u5f97\u5230\u4e86\u4e00\u4e2a\u5217\u8868\uff0c\u5305\u542b\u4e86 5 \u4e2a <li> \u5143\u7d20\uff0c\u6bcf\u4e2a\u5143\u7d20\u90fd\u662f Element \u7c7b\u578b<\/p>\n

\uff082\uff09\u83b7\u53d6 <li> \u6807\u7b7e\u7684\u6240\u6709 class<\/p>\n

\n
result = html.xpath('\/\/li\/@class') \n
print result<\/code><\/pre>\n<\/p><\/div>\n

\u8fd0\u884c\u7ed3\u679c<\/p>\n

\n
 ['item-0'<\/span><\/span>, 'item-1'<\/span><\/span>, 'item-inactive'<\/span>, 'item-1', 'item-0']
<\/span><\/code><\/pre>\n<\/p><\/div>\n

\uff083\uff09\u83b7\u53d6 <li> \u6807\u7b7e\u4e0b href \u4e3a link1.html \u7684 <a> \u6807\u7b7e<\/p>\n

\n
result = html.xpath('\/\/li\/a[@href=\"link1.html\"]') \n
print result<\/code><\/pre>\n<\/p><\/div>\n

\u8fd0\u884c\u7ed3\u679c<\/p>\n

\n
[<Element a at 0x10ffaae18>]
<\/span><\/code><\/pre>\n<\/p><\/div>\n

\uff084\uff09\u83b7\u53d6 <li> \u6807\u7b7e\u4e0b\u7684\u6240\u6709 <span> \u6807\u7b7e<\/p>\n

\u6ce8\u610f\u8fd9\u4e48\u5199\u662f\u4e0d\u5bf9\u7684<\/p>\n

\n
result = html.xpath('\/\/li\/span')<\/code><\/pre>\n<\/p><\/div>\n

\u56e0\u4e3a \/ \u662f\u7528\u6765\u83b7\u53d6\u5b50\u5143\u7d20\u7684\uff0c\u800c <span> \u5e76\u4e0d\u662f <li> \u7684\u5b50\u5143\u7d20\uff0c\u6240\u4ee5\uff0c\u8981\u7528\u53cc\u659c\u6760<\/p>\n

\n
result = html.xpath('\/\/li\/\/span') \n
print result<\/code><\/pre>\n<\/p><\/div>\n

\u8fd0\u884c\u7ed3\u679c<\/p>\n

\n
[<Element span at 0x10d698e18>]<\/code><\/pre>\n<\/p><\/div>\n

\uff085\uff09\u83b7\u53d6 <li> \u6807\u7b7e\u4e0b\u7684\u6240\u6709 class\uff0c\u4e0d\u5305\u62ec <li><\/p>\n

\n
result = html.xpath('\/\/li\/a\/\/@class') \n
print result<\/code><\/pre>\n<\/p><\/div>\n

\u8fd0\u884c\u7ed3\u679c<\/p>\n

\n
['bold']<\/code><\/pre>\n<\/p><\/div>\n

\uff086\uff09\u83b7\u53d6\u6700\u540e\u4e00\u4e2a <li> \u7684 <a> \u7684 href<\/p>\n

\n
result = html.xpath('\/\/li[last()]\/a\/@href') \n
print result<\/code><\/pre>\n<\/p><\/div>\n

\u8fd0\u884c\u7ed3\u679c<\/p>\n

\n
['link5.html']<\/code><\/pre>\n<\/p><\/div>\n

\uff087\uff09\u83b7\u53d6\u5012\u6570\u7b2c\u4e8c\u4e2a\u5143\u7d20\u7684\u5185\u5bb9<\/p>\n

\n
result = html.xpath('\/\/li[last()-1]\/a') \n
print result[0].text<\/code><\/pre>\n<\/p><\/div>\n

\u8fd0\u884c\u7ed3\u679c<\/p>\n

\n
fourth item<\/code><\/pre>\n<\/p><\/div>\n

\uff088\uff09\u83b7\u53d6 class \u4e3a bold \u7684\u6807\u7b7e\u540d<\/p>\n

\n
result = html.xpath('\/\/*[@class=\"bold\"]') \n
print result[0].tag<\/code><\/pre>\n<\/p><\/div>\n

\u8fd0\u884c\u7ed3\u679c<\/p>\n

\n
span<\/code><\/pre>\n<\/p><\/div>\n

<\/body>
\n<\/html><\/p>\n","protected":false},"excerpt":{"rendered":"Python\u722c\u866blxmlzh-cn","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[145],"tags":[],"class_list":["post-1558","post","type-post","status-publish","format-standard","hentry","category-pythonpcjc"],"_links":{"self":[{"href":"https:\/\/bianchenghao6.com\/wp-json\/wp\/v2\/posts\/1558"}],"collection":[{"href":"https:\/\/bianchenghao6.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/bianchenghao6.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/bianchenghao6.com\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/bianchenghao6.com\/wp-json\/wp\/v2\/comments?post=1558"}],"version-history":[{"count":0,"href":"https:\/\/bianchenghao6.com\/wp-json\/wp\/v2\/posts\/1558\/revisions"}],"wp:attachment":[{"href":"https:\/\/bianchenghao6.com\/wp-json\/wp\/v2\/media?parent=1558"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/bianchenghao6.com\/wp-json\/wp\/v2\/categories?post=1558"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/bianchenghao6.com\/wp-json\/wp\/v2\/tags?post=1558"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}