
Python Crawler Code Examples

This article walks through three example Python crawler projects built on the urllib2 library. The sample code is fairly detailed and should be a useful reference for study or work for anyone who needs it.

1. Crawling story jokes:

Note: some of the code may no longer run as written (the examples target Python 2, and the pages they scrape may have changed), but it still has reference value.

#encoding=utf-8
import urllib2
import re


class neihanba():
    def spider(self):
        '''
        Main scheduler of the crawler.
        '''
        isflow = True  # whether to crawl the next page
        page = 1
        while isflow:
            url = "http://www.neihanpa.com/article/list_5_" + str(page) + ".html"
            html = self.load(url)
            self.deal(html, page)
            panduan = raw_input("Continue to the next page? (y/n): ")
            if panduan == "y":
                isflow = True
                page += 1
            else:
                isflow = False

    def load(self, url):
        '''
        Fetch the full page for a URL.
        :param url: the URL to fetch
        :return: the raw HTML of the page
        '''
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
        }
        request = urllib2.Request(url, headers=header)
        response = urllib2.urlopen(request)
        html = response.read()
        return html

    def deal(self, html, page):
        '''
        Run regular expressions over the fetched HTML to extract titles and body text.
        :param html: the HTML fetched earlier
        :param page: the page number being crawled
        '''
        parrten = re.compile('<li class="piclist\d+">(.*?)</li>', re.S)
        titleList = parrten.findall(html)
        for title in titleList:
            parrten1 = re.compile('<a href="/article/\d+\.html">(.*)</a>')
            ti1 = parrten1.findall(title)
            parrten2 = re.compile('<div class="f18 mb20">(.*?)</div>', re.S)
            til2 = parrten2.findall(title)
            for t in ti1:
                tr = t.replace("<b>", "").replace("</b>", "")
                self.writeData(tr, page)
            for t in til2:
                tr = (t.replace("<p>", "").replace("</p>", "")
                      .replace("<br>", "").replace("<br />", "")
                      .replace("&ldquo;", "\"").replace("&rdquo;", "\""))
                self.writeData(tr, page)

    def writeData(self, context, page):
        '''
        Write the extracted content to a file.
        :param context: the matched content
        :param page: the page number currently being crawled
        '''
        fileName = "di" + str(page) + "yehtml.txt"
        with open(fileName, "a") as file:
            file.writelines(context + "\n")


if __name__ == '__main__':
    n = neihanba()
    n.spider()
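
The load() method above is the fetch helper all three examples share: it attaches a browser User-Agent header and reads the page with urllib2. Note that urllib2 and raw_input exist only in Python 2; on Python 3 the same request goes through urllib.request. A minimal sketch of an equivalent fetch helper, assuming a Python 3 environment:

# Python 3 sketch of the load() helper above; urllib2 was merged into urllib.request.
import urllib.request

def load(url):
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(request)
    # read() returns bytes in Python 3, so decode before running regexes over the text
    return response.read().decode("utf-8", errors="ignore")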

2. Crawling Zhilian (zhaopin.com) job listings:

#encoding=utf-8
import urllib
import urllib2
import re


class zhiLian():
    def spider(self, position, workPlace):
        '''
        Main scheduler of the crawler.
        :param position: job title to search for
        :param workPlace: work location
        '''
        url = "http://sou.zhaopin.com/jobs/searchresult.ashx?"
        url += urllib.urlencode({"jl": workPlace})
        url += "&"
        url += urllib.urlencode({"kw": position})
        isflow = True  # whether to crawl the next page
        page = 1
        while isflow:
            pageUrl = url + "&" + str(page)  # build the per-page URL instead of mutating url
            html = self.load(pageUrl)
            self.deal1(html, page)
            panduan = raw_input("Crawl the next page? (y/n): ")
            if panduan == "y":
                isflow = True
                page += 1
            else:
                isflow = False

    def load(self, url):
        '''
        Fetch the full page for a URL.
        :param url: the URL to fetch
        :return: the raw HTML of the page
        '''
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
        }
        request = urllib2.Request(url, headers=header)
        response = urllib2.urlopen(request)
        html = response.read()
        return html

    def deal1(self, html, page):
        '''
        Match the links to the individual job postings in the fetched HTML.
        :param html: the HTML fetched earlier
        :param page: the page number being crawled
        '''
        parrten = re.compile('<a\s+style="font-weight:\s+bold"\s+par="ssidkey=y&ss=\d+&ff=\d+&sg=\w+&so=\d+"\s+href="(.*?)" target="_blank">.*?</a>', re.S)
        til = parrten.findall(html)  # links to the job detail pages
        for t in til:
            self.deal2(t, page)

    def deal2(self, t, page):
        '''
        Second-level crawl: fetch the detail page and match company, salary and experience.
        :param t: URL of the job detail page
        :param page: the page number currently being matched
        '''
        html = self.load(t)  # HTML returned by the second-level crawl
        parrten1 = re.compile('<a\s+onclick=".*?"\s+href=".*?"\s+target="_blank">(.*?)\s+.*?<img\s+class=".*?"\s+src=".*?"\s+border="\d+"\s+vinfo=".*?"></a>', re.S)
        parrten2 = re.compile('<li><span>职位月薪:</span><strong>(.*?) <a.*?>.*?</a></strong></li>', re.S)
        parrent3 = re.compile('<li><span>工作经验:</span><strong>(.*?)</strong></li>', re.S)
        til1 = parrten1.findall(html)
        til2 = parrten2.findall(html)
        til3 = parrent3.findall(html)
        line = ""
        for t in til1:
            t = t.replace('<img title="专属页面" src="//img03.zhaopin.cn/2012/img/jobs/icon.png" border="0" />', "")
            line += t
            line += "\t"
        for t in til2:
            line += t
            line += "\t"
        for t in til3:
            line += t
        self.writeData(line, page)

    def writeData(self, context, page):
        '''
        Write the extracted content to a file.
        :param context: the matched content
        :param page: the page number currently being crawled
        '''
        fileName = "di" + str(page) + "yehtml.txt"
        with open(fileName, "a") as file:
            file.writelines(context + "\n")


if __name__ == '__main__':
    position = raw_input("Enter the job title: ")
    workPlace = raw_input("Enter the work location: ")
    z = zhiLian()
    z.spider(position, workPlace)
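
The Zhilian spider builds its search URL by percent-encoding the work location (jl) and keyword (kw) parameters with urllib.urlencode, so Chinese input survives in the query string. A small illustration of what that produces, assuming Python 3 (where the same function lives in urllib.parse) and hypothetical search values:

# Illustration of the query-string building used in spider() above (Python 3 names).
from urllib.parse import urlencode

base = "http://sou.zhaopin.com/jobs/searchresult.ashx?"
params = urlencode({"jl": "北京", "kw": "python"})  # hypothetical location and keyword
print(base + params)
# -> http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python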

3. Crawling Baidu Tieba:

#encoding=utf-8
import urllib
import urllib2
import re


class teiba():
    def spider(self, name, startPage, endPage):
        '''
        Main scheduler: crawl the given tieba from startPage to endPage.
        '''
        url = "http://tieba.baidu.com/f?ie=utf-8&"
        url += urllib.urlencode({"kw": name})
        for page in range(startPage, endPage + 1):
            pn = 50 * (page - 1)  # each page lists 50 threads
            urlFull = url + "&" + urllib.urlencode({"pn": pn})
            html = self.loadPage(urlFull)
            self.dealPage(html, page)

    def loadPage(self, url):
        '''
        Fetch the full page for a URL.
        '''
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
        }
        request = urllib2.Request(url, headers=header)
        response = urllib2.urlopen(request)
        html = response.read()
        return html

    def dealPage(self, html, page):
        '''
        Match the thread titles and strip the topic tags from them.
        '''
        partten = re.compile(r'<a\s+rel="noreferrer"\s+href="/p/\d+"\s+title=".*?"\s+target="_blank" class="j_th_tit\s+">(.*?)</a>', re.S)
        titleList = partten.findall(html)
        rstr = r'<span\s+class="topic-tag"\s+data-name=".*?">#(.*?)#</span>'
        for title in titleList:
            title = re.sub(rstr, "", title)
            self.writePage(title, page)

    def writePage(self, context, page):
        '''
        Write the extracted titles to a file, one page per file.
        '''
        fileName = "di" + str(page) + "yehtml.txt"
        with open(fileName, "a") as file:
            file.writelines(context + "\n")


if __name__ == '__main__':
    name = raw_input("Enter the tieba name: ")
    startPage = raw_input("Enter the start page: ")
    endPage = raw_input("Enter the end page: ")
    t = teiba()
    t.spider(name, int(startPage), int(endPage))
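
Tieba lists 50 threads per page, which is why spider() computes the offset as pn = 50 * (page - 1) and appends it to the query string. A short sketch of how the per-page URLs come out, assuming Python 3 and a hypothetical tieba name:

# Sketch of the per-page URL construction from spider() above (Python 3 names).
from urllib.parse import urlencode

base = "http://tieba.baidu.com/f?ie=utf-8&" + urlencode({"kw": "python"})  # hypothetical tieba name
for page in range(1, 4):          # pages 1 to 3
    pn = 50 * (page - 1)          # 50 threads per page -> offsets 0, 50, 100
    print(base + "&" + urlencode({"pn": pn}))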
