Python抓取开源中国资讯(使用BeautifulSoup库)

Python抓取开源中国资讯(使用BeautifulSoup库)

效果

(原文此处为运行效果截图,图片未能保留)

代码

#coding=utf8
import requests
import re
import xlrd
import xlwt
import time
from bs4 import BeautifulSoup
# HTTP headers sent with every request: a desktop Firefox User-Agent string.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0'
headers = {'User-Agent': user_agent}

# Output workbook with a single sheet; row 0 of column 0 holds the header
# cell and the scraped rows are written underneath it later on.
# cell_overwrite_ok=True lets a cell be written more than once without xlwt
# raising an error.
myfile = xlwt.Workbook()
table1 = myfile.add_sheet(u"9.9PDD", cell_overwrite_ok=True)
table1.write(0, 0, u"资讯")
class sousuo():
    """Scrape summary texts from one web page into a worksheet column.

    Attributes:
        url: Address of the page to download.
        table: Worksheet-like object; only ``write(row, col, value)`` is
            called on it.
    """

    def __init__(self,url,table):
        """Remember the page address and the worksheet to fill."""
        self.url=url
        self.table=table

    def chaxun(self, timeout=10):
        """Download ``self.url`` and write every summary text to column 0.

        The page title is printed first. Each element whose ``class``
        attribute equals ``"sc sc-text text-gradient wrap summary"`` and
        whose ``.string`` is not ``None`` is printed and written to the next
        free row, starting at row 1 (row 0 is left for the header cell).

        Args:
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                4xx/5xx response.
        """
        # Uses the module-level ``headers`` dict. Without a timeout a stalled
        # connection would block the script forever.
        response = requests.get(self.url, headers=headers, timeout=timeout)
        # Fail loudly on an HTTP error instead of silently parsing an error
        # page and producing an empty sheet.
        response.raise_for_status()
        # Force UTF-8 decoding rather than relying on requests' guess.
        response.encoding = 'UTF-8'

        soup = BeautifulSoup(response.text, "html.parser")
        print(soup.title)

        row = 1
        for tag in soup.find_all(class_="sc sc-text text-gradient wrap summary"):
            text = tag.string
            if text is None:
                # No single string available for this tag; skip it so the
                # column has no blank rows.
                continue
            print(text)
            self.table.write(row, 0, text)
            row += 1

# Scrape the OSChina project-news page into the worksheet created above.
s1=sousuo('https://www.oschina.net/news/project',table1)
s1.chaxun()

# xlwt writes the legacy binary Excel format, so the file needs an .xls
# extension; naming it .xlsx makes Excel reject or warn about the file.
filename = time.strftime('%Y%m%d%H%M%S', time.localtime()) + "oschina.xls"
myfile.save(filename)
# Report the completion time.
print(u"Python抓取开源中国资讯(使用BeautifulSoup库):%s"%time.strftime('%Y%m%d%H%M%S',time.localtime()))


已标记关键词 清除标记
相关推荐
©️2020 CSDN 皮肤主题: 博客之星2020 设计师:CY__ 返回首页