豆瓣电影&唐诗

7-23 380 views

利用爬虫获取些电影信息:

# -*- coding: utf-8 -*-
import urllib2
from HTMLParser import HTMLParser

class MoviesParser(HTMLParser):
    """HTML parser that collects movie records from ``<li>`` start tags.

    Every ``<li>`` tag that carries a non-empty ``data-title`` attribute and a
    ``data-category`` of ``nowplaying`` or ``upcoming`` is converted into a
    dict, appended to ``self.movies`` and printed as soon as it is seen.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.movies = []  # movie dicts, in the order they appear in the page

    # NOTE(review): this statement lives in the class body, so the header is
    # printed once when the class is defined (i.e. at import time), not when
    # parsing starts. It is kept here so the script's output is unchanged.
    print("现在热映的电影信息:")

    @staticmethod
    def _attr(attrlist, attrname):
        """Return the value of the first attribute named *attrname*, else None.

        *attrlist* is the list of ``(name, value)`` pairs that HTMLParser
        passes to ``handle_starttag``.
        """
        for attr in attrlist:
            if attr[0] == attrname:
                return attr[1]
        return None

    def handle_starttag(self, tag, attrs):
        """Record and print a movie for each matching ``<li>`` start tag."""
        if tag != 'li':
            return
        title = self._attr(attrs, 'data-title')
        if not title:
            return

        category = self._attr(attrs, 'data-category')
        if category == 'nowplaying':
            movie = {
                'title': title,                                   # movie name
                'release': self._attr(attrs, 'data-release'),     # release year
                'score': self._attr(attrs, 'data-score'),         # Douban rating
                'duration': self._attr(attrs, 'data-duration'),   # running time
                'region': self._attr(attrs, 'data-region'),       # country/region
                'director': self._attr(attrs, 'data-director'),   # director
                'actors': self._attr(attrs, 'data-actors'),       # cast
            }
            self.movies.append(movie)
            print("片名:%(title)s||年代:%(release)s||评分:%(score)s||时长:%(duration)s||产地:%(region)s||导演:%(director)s||演员:%(actors)s\n" % movie )
        elif category == 'upcoming':
            # Upcoming entries carry fewer attributes; 'head' is the label
            # that prefixes each printed line.
            movie2 = {
                'head': '即将上映的电影信息:',
                'title': title,                                   # movie name
                'duration': self._attr(attrs, 'data-duration'),   # running time
                'director': self._attr(attrs, 'data-director'),   # director
                'actors': self._attr(attrs, 'data-actors'),       # cast
            }
            self.movies.append(movie2)
            print("%(head)s||片名:%(title)s||时长:%(duration)s||导演:%(director)s||演员:%(actors)s\n" % movie2 )

    def handle_data(self, data):
        """Print text whose most recently opened tag is ``<li>``.

        The original author's comment says this text is the release time.
        Whitespace-only text is skipped so that no bare tab is printed.
        """
        if self.lasttag == 'li':
            times = data.strip()
            if times:
                # The trailing comma suppresses the newline under the
                # Python 2 print statement.
                print("%s\t" % times),



def new_movies(url):
    """Download *url* and return the movies ``MoviesParser`` finds in it.

    The request is sent with a browser-like ``User-Agent`` header. The
    response is always closed, even if reading or parsing raises.

    :param url: address of the page to fetch.
    :returns: the parser's ``movies`` list (one dict per movie found).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}  # custom HTTP request headers
    req = urllib2.Request(url, headers=headers)
    s = urllib2.urlopen(req)
    try:
        movies_parser = MoviesParser()
        movies_parser.feed(s.read())  # movies are printed while being parsed
    finally:
        s.close()
    return movies_parser.movies


if __name__ == '__main__':
    # Script entry point: scrape the Shenzhen "now playing" page. The parser
    # prints every movie as it is found; the list is also kept in `movies`.
    url = 'https://movie.douban.com/cinema/nowplaying/shenzhen/'
    movies = new_movies(url)



    # Optional debugging aid (disabled): pretty-print the collected movies as JSON.
    # import json
    # print('%s' % json.dumps(movies, sort_keys=True, indent=4, separators=(',',': ')))

效果图: 


爬取唐诗及对应url:

# -*- coding: utf-8 -*-
import requests
from HTMLParser import HTMLParser


def _attr(attrlist, attrname):
    for attr in attrlist:
        if attr[0] == attrname:
            return attr[1]
    return None


class parser(HTMLParser):
    """Collect poem records from ``<a>`` links nested inside ``<span>`` tags.

    Within a ``<span>``, an ``<a>`` tag supplies the record's ``url`` (its
    ``href`` attribute) and ``title`` (its text). The text that follows the
    link inside the same span is stored as ``author`` and completes the
    record, which is then appended to ``tangshi_list``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.tangshi_list = []  # completed records, in document order
        self.in_span = False    # True while inside a <span>
        self.in_a = False       # True while inside an <a> within a <span>
        self.tangshi = {}       # record currently being assembled

    def handle_starttag(self, tag, attrs):
        """Track entry into ``<span>``/``<a>`` and capture the link target."""
        if tag == 'span':
            self.in_span = True
        if self.in_span and tag == 'a':
            self.in_a = True
            self.tangshi['url'] = _attr(attrs, 'href')

    def handle_endtag(self, tag):
        """Track exit from ``<span>`` and ``<a>``."""
        if tag == 'span':
            self.in_span = False
        if tag == 'a':
            self.in_a = False

    def handle_data(self, data):
        """Store link text as the title and the following span text as author."""
        if self.in_a:
            self.tangshi['title'] = data
        elif self.in_span:
            # Span text that was not preceded by a link would produce a record
            # without 'title'/'url', which breaks the '%(title)s ... %(url)s'
            # formatting done by the caller, so such text is ignored.
            if 'title' not in self.tangshi:
                return
            self.tangshi['author'] = data
            self.tangshi_list.append(self.tangshi)
            self.tangshi = {}




def TSpare():
    """Fetch the gushiwen Tang-poem index page and return the parsed records.

    The response body is fed to ``parser`` and its ``tangshi_list`` is
    returned.
    """
    tangshi_parser = parser()
    response = requests.get('https://so.gushiwen.org/gushi/tangshi.aspx')
    tangshi_parser.feed(response.content)
    return tangshi_parser.tangshi_list



if __name__ == '__main__':
    # Script entry point: print one tab-separated line per poem (title,
    # author, absolute URL), followed by the total number of poems.
    poems = TSpare()
    for poem in poems:
        print('%(title)s\t作者%(author)s\tURL:https://so.gushiwen.org%(url)s' % poem)
    print('总共{}首唐诗'.format(len(poems)))

效果图:

python参数

一、位置参数 调用函数时根据函数定义的参数位置来传递参数。 #!/usr/bin/env python # coding=utf-8 def print_hello(name, sex): sex_dict = {1...

阅读全文

zabbix–api接口

Zabbix_api4.4官方文档 Zabbix_api3.4中文文档 一、初识api(zabbix4.4.4版本) API(Application Programming Interface,应用程序编程接口)是一些预先定义的...

阅读全文

将配置文件构造成json格式让zabbix自动发现监控项

一、简单介绍 上文我们使用了zabbix的自动发现的监控来监控服务的端口,重点写了zabbix的发现的一个原理和过程,但是实际上我们服务器的配置是比较复杂的,而...

阅读全文

欢迎留言