豆瓣电影&唐诗

7-23 459 views

利用爬虫获取些电影信息:

# -*- coding: utf-8 -*-
import urllib2
from HTMLParser import HTMLParser

class MoviesParser(HTMLParser):
    """HTML parser that collects movie records from ``<li>`` start tags.

    A ``<li>`` element that carries a non-empty ``data-title`` attribute and
    a ``data-category`` of ``nowplaying`` or ``upcoming`` is turned into a
    dict, appended to ``self.movies`` and printed as one formatted line.
    Text that follows an ``<li>`` start tag is echoed by ``handle_data``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.movies = []      # every movie dict found so far, in document order

    # NOTE: this statement lives in the class body, so it runs once when the
    # class is defined (not once per instance). Kept so output is unchanged.
    print("现在热映的电影信息:")

    def handle_starttag(self, tag, attrs):
        """Record and print a movie when *tag* is a matching ``<li>``.

        attrs is the list of (name, value) pairs supplied by HTMLParser.
        Tags other than ``li``, items without a ``data-title`` and items
        with any other ``data-category`` are ignored.
        """
        def _attr(attrlist, attrname):
            # Value of the first attribute named attrname, or None.
            for attr in attrlist:
                if attr[0] == attrname:
                    return attr[1]
            return None

        if tag != 'li' or not _attr(attrs, 'data-title'):
            return

        category = _attr(attrs, 'data-category')
        if category == 'nowplaying':
            movie = {
                'title': _attr(attrs, 'data-title'),        # movie name
                'release': _attr(attrs, 'data-release'),    # release year
                'score': _attr(attrs, 'data-score'),        # Douban rating
                'duration': _attr(attrs, 'data-duration'),  # running time
                'region': _attr(attrs, 'data-region'),      # producing region
                'director': _attr(attrs, 'data-director'),  # director
                'actors': _attr(attrs, 'data-actors'),      # cast
            }
            self.movies.append(movie)
            print("片名:%(title)s||年代:%(release)s||评分:%(score)s||时长:%(duration)s||产地:%(region)s||导演:%(director)s||演员:%(actors)s\n" % movie )
        elif category == 'upcoming':
            # Upcoming entries only read a subset of the attributes; 'head'
            # is the label printed in front of each of them.
            movie2 = {
                'head': '即将上映的电影信息:',
                'title': _attr(attrs, 'data-title'),        # movie name
                'duration': _attr(attrs, 'data-duration'),  # running time
                'director': _attr(attrs, 'data-director'),  # director
                'actors': _attr(attrs, 'data-actors'),      # cast
            }
            self.movies.append(movie2)
            print("%(head)s||片名:%(title)s||时长:%(duration)s||导演:%(director)s||演员:%(actors)s\n" % movie2 )

    def handle_data(self, data):
        """Echo the stripped text that follows an ``<li>`` start tag."""
        if self.lasttag == 'li':    # only text whose most recent start tag was <li>
            times = data.strip()     # the original author uses this for the release time
            # Python 2 print statement: the trailing comma suppresses the newline.
            print("%s\t" %times),



def new_movies(url):
    """Download *url* and return the movie dicts parsed from its HTML.

    The page is fetched with urllib2, fed to a fresh MoviesParser (which
    also prints every movie it finds) and the parser's ``movies`` list is
    returned. Errors raised by urllib2 or by the parser propagate to the
    caller; the HTTP response is closed either way.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}  # custom User-Agent request header
    req = urllib2.Request(url, headers=headers)
    s = urllib2.urlopen(req)
    try:
        parser = MoviesParser()
        parser.feed(s.read())
    finally:
        # Close the response even when reading or parsing fails.
        s.close()
    return parser.movies


if __name__ == '__main__':
    # Scrape the Shenzhen "now playing" listing page on Douban.
    movies = new_movies('https://movie.douban.com/cinema/nowplaying/shenzhen/')

    # To inspect the collected dicts as pretty-printed JSON, uncomment:
    # import json
    # print('%s' % json.dumps(movies, sort_keys=True, indent=4, separators=(',',': ')))

效果图: 


爬取唐诗及对应url:

# -*- coding: utf-8 -*-
import requests
from HTMLParser import HTMLParser


def _attr(attrlist, attrname):
    for attr in attrlist:
        if attr[0] == attrname:
            return attr[1]
    return None


class parser(HTMLParser):
    """Collect poem entries from ``<span><a href=...>title</a>author</span>``.

    For every link found inside a ``<span>`` the parser stores the link's
    ``href`` as ``url`` and its text as ``title``; the span text outside the
    link is stored as ``author``. Finished entries are appended to
    ``self.tangshi_list`` as dicts with the keys ``url``, ``title`` and
    ``author``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.tangshi_list = []   # completed entries, in document order
        self.in_span = False     # True between <span> and </span>
        self.in_a = False        # True between an <a> inside a span and </a>
        self.tangshi = {}        # entry currently being assembled

    def handle_starttag(self, tag, attrs):
        """Track entry into ``<span>`` / ``<a>`` and capture the link target."""
        if tag == 'span':
            self.in_span = True
        if self.in_span and tag == 'a':
            self.in_a = True
            self.tangshi['url'] = _attr(attrs, 'href')

    def handle_endtag(self, tag):
        """Track exit from ``<span>`` / ``<a>``."""
        if tag == 'span':
            self.in_span = False
        if tag == 'a':
            self.in_a = False

    def handle_data(self, data):
        """Store link text as the title and other span text as the author."""
        if self.in_a:
            self.tangshi['title'] = data

        if self.in_span and not self.in_a:
            # Span text that is not preceded by a link (whitespace, labels,
            # a second text chunk) must not produce an entry: it would lack
            # 'title'/'url' and break '%(title)s'-style formatting later.
            if 'title' in self.tangshi and 'url' in self.tangshi:
                self.tangshi['author'] = data
                self.tangshi_list.append(self.tangshi)
                self.tangshi = {}




def TSpare():
    """Fetch the Tang-poem index page and return the parsed entries.

    Returns the ``tangshi_list`` built by ``parser`` from the response body.
    """
    response = requests.get('https://so.gushiwen.org/gushi/tangshi.aspx')
    poem_parser = parser()
    poem_parser.feed(response.content)
    return poem_parser.tangshi_list



if __name__ == '__main__':
    # Print one line per poem, then the total number of poems found.
    poems = TSpare()
    for poem in poems:
        print('%(title)s\t作者%(author)s\tURL:https://so.gushiwen.org%(url)s' % poem)
    print('总共{}首唐诗'.format(len(poems)))

效果图:

python的zipfile简单应用

一、场景案例 需求:当开发上传某一个zip包时,自动解压到当前目录。已经解压过的zip包不再解压! 此处用到的模块有Python3自有模块os,sys,time,json,re,zipf...

阅读全文

vip电影的解析

一、出现的原因 由于各大互联网视频app均推出了“轻奢主义”的营销模式,导致了很多优质视频需要我们办这个月卡、年卡才能观看,更离谱的是腾讯最近推出了会员...

阅读全文

logging日志模块

一、日志输出合理的必要性 日志收集是我们日常工作中都会遇到的问题,而一个好的日志输出,则会给收集工作带来大大的效率提升。同时能够给程序员自己排错带来...

阅读全文

欢迎留言