随笔三：爬取喜马拉雅音频

郑统京 · 发表于 2023-5-23 00:09:45

import datetime
import os
import time
import uuid

import requests

from setting import *

# Album IDs to crawl, keyed by the tag that is stored with every track
# downloaded from those albums.
content_dict = {
    "儿歌": ["424529", "245037"],
    "故事": ["12891461", "260744"],
    "古诗": ["15161417", "12914364"],
}

def get_content_xmly(content_dict: dict):
    """Download the tracks of every configured album and record them in MongoDB.

    For each tag in ``content_dict`` and each album ID listed under it, the
    track list is requested page by page (starting at page 1) until a page
    comes back without any ``trackDetailInfos``. For every track the audio
    and the cover image are downloaded and written under ``MUSIC_PATH`` and
    ``COVER_PATH``, using one fresh UUID as the shared file stem. One
    metadata document per track is then inserted into ``MongoDB.content``,
    one batch per page.

    Args:
        content_dict: Mapping of tag name to a list of album IDs.
    """
    for tags, a_list in content_dict.items():
        for aid in a_list:
            # Paging restarts at 1 for every album.
            page = 1
            while True:
                content_list = requests.get(
                    CONTENT_LIST_URL % (aid, page), headers=HEADERS
                ).json()
                page += 1

                # An empty (or missing) track list means the album is
                # exhausted. ``or {}`` keeps a response without a "data"
                # object from raising AttributeError.
                track_infos = (content_list.get('data') or {}).get('trackDetailInfos')
                if not track_infos:
                    break

                # Documents are inserted in bulk, one batch per page, so the
                # batch starts out empty for every page.
                content_mongo_list = []
                for content in track_infos:
                    track_info = content.get('trackInfo')
                    music_name = track_info.get('title')
                    music_url = track_info.get('playPath')

                    # One UUID names both the audio file and its cover image.
                    music_file_name = uuid.uuid4()
                    music_path = os.path.join(MUSIC_PATH, f"{music_file_name}.mp3")
                    image_path = os.path.join(COVER_PATH, f"{music_file_name}.jpg")

                    # The API returns the cover as a path; prefix the image host.
                    music_image = IMAGE_HOST + track_info.get('cover')

                    # Download both payloads and write the raw bytes to disk.
                    music = requests.get(music_url).content
                    with open(music_path, 'wb') as f:
                        f.write(music)
                    music_image = requests.get(music_image).content
                    with open(image_path, 'wb') as f:
                        f.write(music_image)

                    # Only file names are stored in the database, never the
                    # binary content itself.
                    content_info = {
                        'title': music_name,
                        'music': f'{music_file_name}.mp3',
                        'cover': f'{music_file_name}.jpg',
                        'tag': tags,
                        'createTime': datetime.datetime.now()
                    }
                    content_mongo_list.append(content_info)

                    # Throttle the crawl between tracks.
                    time.sleep(1)

                # Insert the documents built for this page. The original code
                # passed the raw API response (``content_list``) here, so the
                # track documents were never stored.
                MongoDB.content.insert_many(content_mongo_list)
# Run the crawl for every album listed in content_dict.
get_content_xmly(content_dict)

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

随笔三：爬取喜马拉雅音频

浏览过的版块

扫码加入运维网微信交流群

随笔三 ： 爬取喜马拉雅音频

浏览过的版块

随笔三：爬取喜马拉雅音频