前言:本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理。
作者: Star_Zhao
PS:如有需要Python学习资料的小伙伴可以加点击下方链接自行获取http://t.cn/A6Zvjdun
文章插图
本次爬取自如网房源信息所用到的知识点:
- requests get请求
- lxml解析html
- Xpath
- MongoDB存储
- url: http://hz.ziroom.com/z/nl/z3.html?p=2 的p参数控制分页
- get请求
# -*- coding: utf-8 -*-
import requests
import time
from requests.exceptions import RequestException


def get_one_page(page):
    """Fetch one ziroom listing page and return its HTML text.

    Args:
        page: 1-based page number (becomes the ``p`` query parameter).

    Returns:
        The page's HTML as a string, or None when the request raises
        or the server does not answer with HTTP 200.
    """
    try:
        url = "http://hz.ziroom.com/z/nl/z2.html?p=" + str(page)
        # Browser-like headers so the site serves the normal page.
        headers = {
            'Referer': 'http://hz.ziroom.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0(windowsNT6.3;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            # BUG FIX: the original only printed res.text and returned
            # None; later code does `html = get_one_page(page)` and
            # parses the result, so the HTML must be returned.
            return res.text
        return None
    except RequestException:
        return None


def main():
    page = 1
    # Keep the original script's observable output: print the page.
    html = get_one_page(page)
    if html is not None:
        print(html)


if __name__ == '__main__':
    main()
    time.sleep(1)
解析单页源码- 解析html文档, 目的: 测试XPath表达式
from lxml import etree

# Parse a locally saved HTML document; goal: test the XPath
# expressions against a static copy before hitting the live site.
doc = etree.parse("./resul.html", etree.HTMLParser())
listings = doc.xpath('//ul[@id="houseList"]/li')

# Skip the first <li> (presumably a non-listing row — confirm
# against the saved page).
for item in listings[1:]:
    raw_title = item.xpath("./div/h3/a/text()")[0]
    # Drop the 5-character prefix; short titles become "".
    if len(raw_title) > 5:
        title = raw_title[5:]
    else:
        title = ""
    location = item.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
    # Join the span texts with spaces, then remove the first space.
    area = " ".join(item.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
    nearby = item.xpath("./div/div/p[2]/span/text()")[0]
    print(title)
    print(location)
    print(area)
    print(nearby)
# Parse the page source.
from lxml import etree


def parse_one_page(sourcehtml):
    """Parse one listing page's HTML and yield one dict per house.

    Args:
        sourcehtml: full HTML text of a listing page.

    Yields:
        dicts with keys "title", "location", "area" and "nearby".
    """
    content_tree = etree.HTML(sourcehtml)
    results = content_tree.xpath('//ul[@id="houseList"]/li')
    # Skip the first <li> (presumably a non-listing row — matches
    # the original slicing).
    for result in results[1:]:
        # Evaluate each XPath once instead of twice per field.
        title_texts = result.xpath("./div/h3/a/text()")
        # Drop the 5-character prefix; short/missing titles become "".
        title = title_texts[0][5:] if title_texts and len(title_texts[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join span texts with spaces and strip the leading separator.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        # Guard against a missing node instead of indexing blindly
        # (the later revision of this function does the same).
        nearby_texts = result.xpath("./div/div/p[2]/span/text()")
        nearby = nearby_texts[0] if nearby_texts else ""
        yield {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby,
        }


def main():
    page = 1
    html = get_one_page(page)
    print(type(html))
    # BUG FIX: removed a stray bare `parse_one_page(html)` call that
    # only built a generator and discarded it without iterating.
    for item in parse_one_page(html):
        print(item)


if __name__ == '__main__':
    main()
    time.sleep(1)
# Fetch multiple pages.
def parse_one_page(sourcehtml):
    """Parse one listing page's HTML and yield one dict per house.

    Args:
        sourcehtml: full HTML text of a listing page.

    Yields:
        dicts with keys "title", "location", "area" and "nearby".
    """
    content_tree = etree.HTML(sourcehtml)
    results = content_tree.xpath('//ul[@id="houseList"]/li')
    # Skip the first <li> (presumably a non-listing row — matches
    # the original slicing).
    for result in results[1:]:
        # Evaluate the XPath once; guard against missing nodes the
        # same way `nearby` already did in the original.
        title_texts = result.xpath("./div/h3/a/text()")
        title = title_texts[0][5:] if title_texts and len(title_texts[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join span texts with spaces and strip the leading separator.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        nearby_texts = result.xpath("./div/div/p[2]/span/text()")
        nearby = nearby_texts[0].strip() if nearby_texts else ""
        # BUG FIX: removed a leftover debug `print(nearby)` that ran
        # after every yield, and dead commented-out code.
        yield {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby,
        }


def get_pages():
    """Return the total page count read from page 1's pager widget."""
    html = get_one_page(1)
    content_tree = etree.HTML(html)
    # The pager's second <span> reads like "共N页"; strip the CJK
    # characters and keep the number.
    pages_text = content_tree.xpath('//div[@class="pages"]/span[2]/text()')[0]
    return int(pages_text.strip("共页"))


def main():
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        for item in parse_one_page(html):
            print(item)


if __name__ == '__main__':
    main()
    time.sleep(1)
推荐阅读
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 手把手教你用Python进行SSH暴力破解
- Python自建免费HTTP服务器,无公网IP也能远程访问
- CentOS7安装Python3.x
- 10行代码教你用python鉴黄
- 让python克隆别人的声音
- Python自动整理文件夹
- Python爬虫快速入门,静态网页爬取
- Python多线程死锁问题的巧妙解决方法
- 3种方法实现python-matplotlib显示中文
- python第三方库uiautomator2 操作方法汇总