爬取视频小测试

7-18 710 views

一、爬取大致思路

首先找到你要爬取的视频网址,用urllib获取含有mp4视频地址的网页源码;若第一次获取的源码中没有mp4视频地址,可以继续对其中的链接逐层获取,直到拿到包含mp4视频地址的源码。然后用re从源码中匹配出视频地址,再通过requests模块下载视频。

二、实例网址分析

http://www.maiziedu.com/course/645-9571/  是一个教学视频网址,某些内容貌似要会员才能看,我们没钱的穷人看不起,只能花点心思爬下来慢慢看。

可以看到网站左边是课程的目录链接地址,我们要做的就是把这些地址找出来,然后再过滤出其中视频的地址,最后下载到本地即可。

先看课程链接地址长啥样,方便re匹配,鼠标右键,查看网页源代码:

很容易的可以看出,这些应该是课程目录地址,我们随机进入一个地址,看看视频地址长啥样,同样进去网页后,右键查看源代码

很容易看出这就是视频的地址,我们最终想要得到的数据。下面开始撸点代码了

三、测试下载代码

# -*- coding: utf-8 -*-
import re
import  urllib
import requests


def getHtml(url):
    """Fetch a page and return its body without decoding it.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``read()`` delivers for the response body; no decoding is
        applied here.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when reading fails part-way.
        page.close()


def getmp4(html):
    """Extract the video addresses embedded in a lesson page.

    Args:
        html: Source text of a lesson page.

    Returns:
        A list holding the captured ``http...mp4`` address for each match
        of ``lessonUrl = "..."`` (case-insensitive).  Because the pattern
        begins with a greedy ``.*``, only the last such assignment on any
        one line is captured.
    """
    # findall returns just the parenthesised group, i.e. the address itself.
    lesson_url_re = re.compile('.*lessonUrl = "(http.*?mp4)"', re.I)
    return lesson_url_re.findall(html)

def geturl(url, course_id='645'):
    """Collect the lesson links of a course from a page's source.

    Args:
        url: Despite its name, the HTML source text to scan (the name is
            kept so existing callers keep working).
        course_id: Course number the links must belong to; defaults to
            the course this script was written for.

    Returns:
        A list of site-relative paths such as ``/course/645-9571/`` in the
        order they appear; empty when nothing matches.
    """
    # [^"]* keeps the capture inside the href's own quotes; a greedy ".*"
    # would run on to the last '/"' found anywhere on the same line.
    link_re = re.compile(
        '<a href="(/course/%s-[^"]*/)"' % re.escape(str(course_id)), re.I)
    return link_re.findall(url)

import os  # local to this snippet: only needed to create the output folder

url = 'http://www.maiziedu.com/course/645-9571/'
html = getHtml(url)
mp4url = geturl(html)
print(mp4url)

# open() does not create missing folders, so make sure mp4/ exists first.
if not os.path.isdir('mp4'):
    os.makedirs('mp4')

for i in mp4url:
    URL = 'http://www.maiziedu.com' + i      # absolute address of the lesson page
    HTML = getHtml(URL)
    name = i.split('/')[2]        # e.g. "645-9571"; used as the file name
    info = getmp4(HTML)     # video addresses found on the lesson page
    print(info)
    if not info:
        # No video address on this page: skip it instead of failing on info[0].
        print("{}未找到视频地址,跳过".format(name))
        continue
    print("开始下载{}".format(name))
    r = requests.get(info[0], stream=True)
    try:
        # Saved into the mp4 folder below the current directory.
        with open("mp4/{}.mp4".format(name), "wb") as code:
            # Stream in chunks so a whole video never has to sit in memory.
            for chunk in r.iter_content(chunk_size=1024 * 10):
                code.write(chunk)
    finally:
        r.close()
    print("下载完成{}".format(name))

执行效果图:


这个课程这些视频就被下载下来了。
代码还有可以优化的地方:目前看不到下载进度,可以加上进度显示,这样使用起来更友好一点。
根据url获取远程文件大小的代码:
# -*- coding: utf-8 -*-
import urllib2
def getRemoteFileSize(url, proxy=None):
    """Return the size of a remote file as announced by Content-Length.

    A HEAD request is sent, so only the headers are asked for.

    Args:
        url: Address of the remote file.
        proxy: Optional proxy address; it is registered for https when
            ``url`` starts with ``https://`` and for http otherwise.

    Returns:
        The Content-Length value as an int, or 0 when the request fails
        (for example because the remote file does not exist) or the header
        is absent.
    """
    opener = urllib2.build_opener()
    if proxy:
        scheme = 'https' if url.lower().startswith('https://') else 'http'
        opener.add_handler(urllib2.ProxyHandler({scheme: proxy}))
    try:
        request = urllib2.Request(url)
        request.get_method = lambda: 'HEAD'
        response = opener.open(request)
    except Exception:  # best effort: any failure is reported as size 0
        return 0
    try:
        headers = dict(response.headers)
    finally:
        # Only the headers are needed, so the connection is released here.
        response.close()
    return int(headers.get('content-length', 0))


# Demo: ask for the size of one video file and print the result.
url = 'http://newoss.maiziedu.com/pcjc/pcjc-01.mp4'
allcount = getRemoteFileSize(url=url)
print(allcount)

下载url文件时获取下载进度及下载速度的代码:
# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import time
import re
import urllib
def download_file(url, path):
    """Stream a remote file to disk while reporting progress.

    Args:
        url: Address of the file to download.
        path: Local file the content is written to; it is opened in
            binary write mode, so an existing file is overwritten.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    with closing(requests.get(url, stream=True)) as r:
        # Fail early instead of saving an HTTP error page as the video.
        r.raise_for_status()
        chunk_size = 1024*10
        # Content-Length may be missing; 0 stands for "size unknown".
        content_size = int(r.headers.get('content-length', 0))
        print('下载开始')
        # Progress needs the total size, so it is skipped when unknown.
        p = None
        if content_size > 0:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if not chunk:
                    # An empty chunk carries no data and must not be
                    # counted as downloaded progress.
                    continue
                f.write(chunk)
                if p is not None:
                    p.output()


class ProgressData(object):
    """Track and print the progress of a chunked download.

    Sizes are passed in as plain numbers and stored divided by 1000; the
    stored figures are what gets printed next to ``unit``.  Progress is
    estimated as ``count * block``: every call to :meth:`output` is taken
    to stand for one full block.
    """

    def __init__(self, block, size, unit, file_name=''):
        """Set up the counters.

        Args:
            block: Size of one downloaded chunk.
            size: Total size of the file.
            unit: Label printed after every size figure.
            file_name: Optional name shown in front of each message.
        """
        self.file_name = file_name
        self.block = block/1000.0
        self.size = size/1000.0
        self.unit = unit
        self.count = 0  # number of chunks reported so far
        self.start = time.time()  # reference time for the next speed figure

    def output(self):
        """Record one more chunk and print a progress or completion line."""
        import sys  # local import keeps this class usable on its own

        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        speed = self.block/elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count*self.block
        if loaded >= self.size:
            # Checked before dividing by the size, so a total size of 0
            # finishes cleanly instead of raising ZeroDivisionError.
            print(u'\n%s下载完成\r\n' % self.file_name)
            return
        progress = round(loaded/self.size, 4)
        line = '\r{0}下载进度{1:.2f}{2}/{3:.2f}{4} 下载速度{5:.2%} {6:.2f}{7}/s'.format(
            self.file_name, loaded, self.unit,
            self.size, self.unit, progress, speed, self.unit)
        # No newline: the leading '\r' lets each update overwrite the
        # previous one; flush so the line shows up immediately.
        sys.stdout.write(line)
        sys.stdout.flush()

# Example call; the two values below look like placeholders to be replaced
# with a real address and a real target file.
url = "http://XXXXX"
path = "XXX/XXX.mp4"
download_file(url=url, path=path)

下面将这两段代码加入之前爬取视频的代码中,实现下载视频时获取下载进度及下载速度:

# -*- coding: utf-8 -*-
import re
import urllib
import requests
import urllib2
import time
import os
from contextlib import closing


def getHtml(url):
    """Fetch a page and return its body without decoding it.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``read()`` delivers for the response body; no decoding is
        applied here.
    """
    # closing() guarantees the connection is released even if read() fails.
    with closing(urllib.urlopen(url)) as page:
        return page.read()



def getmp4(html):
    """Pull the video addresses out of a lesson page's source.

    Args:
        html: Source text of a lesson page.

    Returns:
        A list holding the captured ``http...mp4`` address for each match
        of ``lessonUrl = "..."`` (case-insensitive).  Because the pattern
        begins with a greedy ``.*``, only the last such assignment on any
        one line is captured.
    """
    video_re = re.compile('.*lessonUrl = "(http.*?mp4)"', re.I)
    addresses = video_re.findall(html)
    return addresses

def geturl(url, course_id='645'):
    """Collect the lesson links of a course from a page's source.

    Args:
        url: Despite its name, the HTML source text to scan (the name is
            kept so existing callers keep working).
        course_id: Course number the links must belong to; defaults to
            the course this script was written for.

    Returns:
        A list of site-relative paths such as ``/course/645-9571/`` in the
        order they appear; empty when nothing matches.
    """
    # [^"]* keeps the capture inside the href's own quotes; a greedy ".*"
    # would run on to the last '/"' found anywhere on the same line.
    link_re = re.compile(
        '<a href="(/course/%s-[^"]*/)"' % re.escape(str(course_id)), re.I)
    return link_re.findall(url)

def getRemoteFileSize(url, proxy=None):
    """Return the size of the file to download, taken from Content-Length.

    A HEAD request is sent, so only the headers are asked for.

    Args:
        url: Address of the remote file.
        proxy: Optional proxy address; it is registered for https when
            ``url`` starts with ``https://`` and for http otherwise.

    Returns:
        The Content-Length value as an int, or 0 when the request fails
        (for example because the remote file does not exist) or the header
        is absent.
    """
    opener = urllib2.build_opener()
    if proxy:
        scheme = 'https' if url.lower().startswith('https://') else 'http'
        opener.add_handler(urllib2.ProxyHandler({scheme: proxy}))
    try:
        request = urllib2.Request(url)
        request.get_method = lambda: 'HEAD'
        response = opener.open(request)
    except Exception:  # best effort: any failure is reported as size 0
        return 0
    try:
        headers = dict(response.headers)
    finally:
        # Only the headers are needed, so the connection is released here.
        response.close()
    return int(headers.get('content-length', 0))


class ProgressData(object):
    """Track and print the progress of a chunked download.

    Sizes are passed in as plain numbers and stored divided by 1000; the
    stored figures are what gets printed next to ``unit``.  Progress is
    estimated as ``count * block``: every call to :meth:`output` is taken
    to stand for one full block.
    """

    def __init__(self, block, size, unit, file_name=''):
        """Set up the counters.

        Args:
            block: Size of one downloaded chunk.
            size: Total size of the file.
            unit: Label printed after every size figure.
            file_name: Optional name shown in front of each message.
        """
        self.file_name = file_name
        self.block = block/1000.0
        self.size = size/1000.0
        self.unit = unit
        self.count = 0  # number of chunks reported so far
        self.start = time.time()  # reference time for the next speed figure

    def output(self):
        """Record one more chunk and print a progress or completion line."""
        import sys  # local import keeps this class usable on its own

        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        speed = self.block/elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count*self.block
        if loaded >= self.size:
            # Checked before dividing by the size, so a total size of 0
            # finishes cleanly instead of raising ZeroDivisionError.
            print(u'\n%s下载完成\r\n' % self.file_name)
            return
        progress = round(loaded/self.size, 4)
        line = '\r{0}下载进度{1:.2f}{2}/{3:.2f}{4} 下载速度{5:.2%} {6:.2f}{7}/s'.format(
            self.file_name, loaded, self.unit,
            self.size, self.unit, progress, speed, self.unit)
        # No newline: the leading '\r' lets each update overwrite the
        # previous one; flush so the line shows up immediately.
        sys.stdout.write(line)
        sys.stdout.flush()

def download_file(url, path):
    """Stream a remote file to disk while reporting progress.

    Args:
        url: Address of the file to download.
        path: Local file the content is written to; it is opened in
            binary write mode, so an existing file is overwritten.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    with closing(requests.get(url, stream=True)) as r:
        # Fail early instead of saving an HTTP error page as the video.
        r.raise_for_status()
        chunk_size = 1024*10
        # Content-Length may be missing; 0 stands for "size unknown".
        content_size = int(r.headers.get('content-length', 0))
        print('下载开始')
        # Progress needs the total size, so it is skipped when unknown.
        p = None
        if content_size > 0:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if not chunk:
                    # An empty chunk carries no data and must not be
                    # counted as downloaded progress.
                    continue
                f.write(chunk)
                if p is not None:
                    p.output()


url = 'http://www.maiziedu.com/course/645'

html = getHtml(url)
mp4url = geturl(html)

# download_file opens its target with open(), which does not create missing
# folders, so make sure mp4/ exists before the first download.
if not os.path.isdir('mp4'):
    os.makedirs('mp4')

for i in mp4url:
    URL = 'http://www.maiziedu.com' + i  # absolute address of the lesson page
    HTML = getHtml(URL)
    name = i.split('/')[2]  # e.g. "645-9571"; doubles as the file name
    info = getmp4(HTML)
    print(info)
    if not info:
        # No video address on this page: skip it instead of failing on info[0].
        print("{}未找到视频地址,跳过".format(name))
        continue
    print("开始下载{}".format(name))
    allcount = getRemoteFileSize(info[0])
    if allcount == 0:
        print("{}远程文件不存在".format(name))
        # Same effect as exit(1), without relying on the helper that the
        # site module installs for interactive use.
        raise SystemExit(1)
    print('{}文件大小:{}Byte'.format(name,allcount))
    path = 'mp4/' + name + '.mp4'
    download_file(info[0], path)
    # Compare the size on disk with the announced size as an integrity check.
    size_check = os.path.getsize(path)
    print("{}下载后文件大小为{}Byte".format(name,size_check))
    if size_check == allcount:
        print("{}下载成功,文件大小校验一致!".format(name))
    else:
        print("{}文件下载缺失部分数据,文件大小校验不一致,可能无法使用该文件!".format(name))

下载该代码[download]
执行效果图:

python参数

一、位置参数 调用函数时根据函数定义的参数位置来传递参数。 #!/usr/bin/env python # coding=utf-8 def print_hello(name, sex): sex_dict = {1...

阅读全文

zabbix–api接口

Zabbix_api4.4官方文档 Zabbix_api3.4中文文档 一、初识api(zabbix4.4.4版本) API(Application Programming Interface,应用程序编程接口)是一些预先定义的...

阅读全文

将配置文件构造成json格式让zabbix自动发现监控项

一、简单介绍 上文我们使用了zabbix的自动发现的监控来监控服务的端口,重点写了zabbix的发现的一个原理和过程,但是实际上我们服务器的配置是比较复杂的,而...

阅读全文

2 条评论

欢迎留言