Preface
I have been learning Python for a while and have read a bit about web crawlers. They looked like fun, so I wanted to try writing one myself, and that is how my first crawler was born.
Environment
Install beautifulsoup4 with pip (connect through a VPN first, otherwise pip will report a timeout error). BeautifulSoup does away with fiddly regular expressions, which makes development much friendlier.
sudo -H pip --default-timeout=100 install beautifulsoup4
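To confirm the install worked, a quick import check like the sketch below should print the installed version and parse a trivial fragment (the exact version number will vary with your setup):

# Sanity check: bs4 imports and can parse a trivial fragment
import bs4
from bs4 import BeautifulSoup

print(bs4.__version__)  # e.g. 4.x
print(BeautifulSoup("<p>hello</p>", "html.parser").p.string)  # hello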
Implementation
The program below does one small job: it crawls all second-hand house listings under 100 m² in Suzhou Industrial Park from Lianjia and writes them to a CSV file (which Excel can open). P.S. the site's DOM changes over time, so this code won't stay valid forever. 😄
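Before the full program, here is a minimal sketch of the page structure the parser relies on. The HTML fragment below is made up for illustration; only the class names (house-lst, info-panel, col-1, where, col-3, price, price-pre) reflect Lianjia's listing page at the time of writing:

# -*- coding: utf-8 -*-
# Illustration of the selector chain used in the crawler below.
# The HTML is a hypothetical, stripped-down single listing item.
from bs4 import BeautifulSoup

html = '''
<ul class="house-lst">
  <li>
    <div class="info-panel">
      <a href="/ershoufang/123.html">A cosy two-bedroom</a>
      <div class="col-1">
        <div class="where">
          <a>Some Estate</a>
          <span>|</span>
          <span>2室1厅</span>
          <span>89平米</span>
        </div>
      </div>
      <div class="col-3">
        <div class="price"><span>200</span></div>
        <div class="price-pre">22472元/平</div>
      </div>
    </div>
  </li>
</ul>
'''

soup = BeautifulSoup(html, "html.parser")
item = soup.find("ul", class_="house-lst").find("li")
panel = item.find("div", class_="info-panel")
where = panel.find("div", class_="col-1").find("div", class_="where")
print(panel.a.string)                    # title
print(where.a.string)                    # estate / address
print(where.find_all("span")[1].string)  # layout (huxing)
print(where.find_all("span")[2].string)  # floor area (square)
print(panel.find("div", class_="price").span.string)  # total price
print(panel.find("div", class_="price-pre").string)   # unit price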
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import codecs
import requests
import urllib2  # only needed for the alternative fetch shown in grab()

# Python 2 hack so that formatting unicode strings with %s
# does not raise UnicodeEncodeError
reload(sys)
sys.setdefaultencoding('utf8')

from bs4 import BeautifulSoup


# Plain container for the fields scraped from one listing
class houseInfo:
    def __init__(self, title='', address='', square='', huxing='',
                 priceInfo='', unitPrice='', link=''):
        self.title = title
        self.address = address
        self.square = square
        self.huxing = huxing
        self.priceInfo = priceInfo
        self.unitPrice = unitPrice
        self.link = link


results = []  # one CSV line per listing ("list" would shadow the builtin)


def grab(url):
    # Either requests or urllib2 works for fetching the page:
    # request = urllib2.Request(url)
    # response = urllib2.urlopen(request)
    # plain_text = response.read()
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    # Every listing sits in an <li> under <ul class="house-lst">
    for item in soup.find("ul", class_="house-lst").find_all('li'):
        info_panel = item.find("div", class_="info-panel")
        where = info_panel.find("div", class_="col-1").find("div", class_="where")
        col3 = info_panel.find("div", class_="col-3")
        title = info_panel.a.string
        address = where.a.string
        huxing = where.find_all('span')[1].string  # layout, e.g. 2室1厅
        square = where.find_all('span')[2].string  # floor area
        priceInfo = col3.find("div", class_="price").span.string  # total price (万)
        unitPrice = col3.find("div", class_="price-pre").string   # price per m²
        link = 'http://su.lianjia.com' + info_panel.a.get('href')
        info = houseInfo(title, address, square, huxing, priceInfo, unitPrice, link)
        single = '%s,%s,%s,%s,%s万,%s,%s\n' % (title, address, square,
                                               huxing, priceInfo, unitPrice, link)
        results.append(single)


# Pages 1 to 22 of the filtered listings in Suzhou Industrial Park
for i in range(1, 23):
    page = "http://su.lianjia.com/ershoufang/gongyeyuan/d{0}p1/".format(str(i))
    grab(page)

with open('data.csv', 'w') as f:
    # The UTF-8 BOM keeps Chinese text from garbling when the file is
    # opened in Excel; write it once at the top, not once per row
    f.write(codecs.BOM_UTF8.decode('utf-8'))
    for item in results:
        f.write(item)
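To spot-check the result, the generated data.csv can be read back with the standard csv module (a small sketch; note that the first field of the first row will still carry the UTF-8 BOM written above):

# Read the generated file back and print each row
import csv

with open('data.csv') as f:
    for row in csv.reader(f):
        # columns: title, address, area, layout, total price, unit price, link
        print(row)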