Bulk download of journals

Nuclear Fusion

Posted by JXLIU on June 20, 2023

Batch-downloading the PDF files of a journal

The following Python script batch-downloads the PDF files of a given issue of Nuclear Fusion, names each file after its first author, and saves them to the folder NF/vol/issue.
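The scripts only depend on requests and BeautifulSoup (beautifulsoup4). For orientation, this is the issue-URL pattern and output layout they assume; 0029-5515 is the ISSN of Nuclear Fusion, and the volume and issue numbers here are just example values:

import os

vol, issue = 63, 6  # example volume and issue
# Issue page on IOPscience: https://iopscience.iop.org/issue/0029-5515/<vol>/<issue>
url = 'https://iopscience.iop.org/issue/0029-5515/' + str(vol) + '/' + str(issue)
# PDFs are named after the first author and stored under NF/<vol>/<issue>
outdir = os.path.join('NF', str(vol), str(issue))
print(url)
print(outdir)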

release v1

import os
import time
import requests
from bs4 import BeautifulSoup

baseurl = 'https://iopscience.iop.org'
vol = 63
issue = 6
isbn = '/0029-5515/'  # ISSN of Nuclear Fusion
journal = 'NF'
url = baseurl + '/issue' + isbn + str(vol) + '/' +str(issue)
print(url)
#url = 'https://iopscience.iop.org/issue/0029-5515/63/8'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)
#meta_list = soup.select('.small.art-list-item-meta, .mr-2.nowrap')


meta_list = soup.select('.small.art-list-item-meta')
pdf_names = []  # list of PDF file names
for meta in meta_list:
    authors = meta.text.strip()
    print(authors)
    if ',' in authors:
        firstauthor = authors.split(',')[0].replace(' ', '')
    else:
        firstauthor = authors.strip().replace(' ', '')
    print(firstauthor)
    pdf_names.append(firstauthor+'.pdf')
    
link_list = soup.select('.mr-2.nowrap')
pdf_urls = []  # list of PDF file URLs
for link in link_list:
    href = link.get('href')
    if href[-3:].upper() == 'PDF':
        pdf_url = baseurl+href
        pdf_urls.append(pdf_url)
        print(pdf_url)

## Create the output folder
os.makedirs(os.path.join(journal, str(vol), str(issue)), exist_ok=True)
## Download the PDFs
# Initialise the counter
count = 0
for pdf_url, pdf_name in zip(pdf_urls, pdf_names):
    # Check whether the file already exists
    if not os.path.exists(os.path.join(journal, str(vol), str(issue), pdf_name)):
        with open(os.path.join(journal, str(vol), str(issue), pdf_name), "wb") as f:
            resp = requests.get(pdf_url)
            f.write(resp.content)
        print(pdf_name + "文件已下载保存到文件夹 " + os.path.join(journal, str(vol), str(issue)))
        count += 1
    else:
        # If the file already exists, save the new download under a numbered name
        name, ext = os.path.splitext(pdf_name)
        i = 1
        new_name = name + "_" + str(i) + ext
        while os.path.exists(os.path.join(journal, str(vol), str(issue), new_name)):
            i += 1
            new_name = name + "_" + str(i) + ext
        with open(os.path.join(journal, str(vol), str(issue), new_name), "wb") as f:
            resp = requests.get(pdf_url)
            f.write(resp.content)
        print(new_name + "文件已下载保存到文件夹 " + os.path.join(journal, str(vol), str(issue)))
        # 计数器加 1
        count += 1
    #time.sleep(5*60)
    time.sleep(1)
# Check whether every PDF file was downloaded and saved
if count == len(pdf_names):
    print('volume ' + str(vol) + ', issue ' + str(issue) + ' downloaded completely.')
else:
    print('The number of downloaded files does not match.')

release v2

Improved the data extraction and the file-naming rules; can download the contents of multiple volumes.
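The v2 script below still fixes vol = 63 and only loops over issues; to actually cover several volumes, the issue loop can be wrapped in an outer volume loop. A minimal sketch, with the volume and issue ranges chosen purely as placeholders (the loop body would be the rest of the v2 script):

baseurl = 'https://iopscience.iop.org'
for vol in range(62, 64):       # e.g. volumes 62 and 63; adjust as needed
    for issue in range(1, 13):  # issues 1-12, as in the v3 script
        url = baseurl + '/issue/0029-5515/' + str(vol) + '/' + str(issue)
        print(url)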

import os
import sys
import subprocess
import time
import requests
import random
from bs4 import BeautifulSoup

for issue in range(1, 2):

    baseurl = 'https://iopscience.iop.org'
    vol = 63
    isbn = '/0029-5515/'
    journal = 'NF'
    url = baseurl + '/issue' + isbn + str(vol) + '/' +str(issue)
    print(url)
    #url = 'https://iopscience.iop.org/issue/0029-5515/63/8'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    if soup.find('h2', string='We apologize for the inconvenience...'):
        print(soup)
        print('Please unlock the website manually.')
        sys.exit()
    #meta_list = soup.select('.small.art-list-item-meta, .mr-2.nowrap')

    # Find all divs with class="art-list-item-body"
    art_list_item_body_divs = soup.find_all('div', {'class': 'art-list-item-body'})
    
    # Create empty lists to store values
    #indexer_list = []
    #authors_list = []
    #mr_2_nowrap_list = []
    pdf_names = []  # list of PDF file names
    pdf_urls = []   # list of PDF file URLs
    # Loop through each div and extract the values of class="indexer", "small art-list-item-meta", "mr-2 nowrap"
    for div in art_list_item_body_divs:
        indexer = div.find('div', {'class': 'indexer'})
        small_art_list_item_meta = div.find('p', {'class': 'small art-list-item-meta'})
        mr_2_nowrap = div.find('div', {'class': 'art-list-item-title-wrapper'})
        a = mr_2_nowrap.find('a', {'class': 'art-list-item-title'})
        href = a.get('href')
        if small_art_list_item_meta is not None: 
            authors = small_art_list_item_meta.get_text(strip=True)
            if ',' in authors:
                firstauthor = authors.split(',')[0].replace(' ', '')
            else:
                firstauthor = authors.strip().replace(' ', '')
        else:
            print('No small art-list-item-meta found in this art-list-item-body')
            firstauthor = 'None'
        indexer = indexer.text.strip()
        title = mr_2_nowrap.text.strip().replace(' ','_')
        pdf_name = firstauthor + '_' + indexer + '_' + title + '.pdf'
        pdf_url = baseurl + href + '/pdf'
        pdf_names.append(pdf_name)
        pdf_urls.append(pdf_url)
        print(pdf_name)
        print(pdf_url)
    # sys.exit()
    # Check that pdf_names and pdf_urls have the same length
    print(str(issue), str(len(pdf_names)), str(len(pdf_urls)))
    if(len(pdf_names) != len(pdf_urls)):
        print('The number of authors and urls is different in volume ' + str(vol) + ', issue ' + str(issue) + '.')
        continue
    print(str(issue))
    ## Create the output folder
    os.makedirs(os.path.join(journal, str(vol), str(issue)), exist_ok=True)
    ## Download the PDFs
    # Initialise the counter
    count = 0
    for pdf_url, pdf_name in zip(pdf_urls, pdf_names):
        #print(pdf_url)
        #print(pdf_name)
        # Check whether the file already exists
        if not os.path.exists(os.path.join(journal, str(vol), str(issue), pdf_name)):
            with open(os.path.join(journal, str(vol), str(issue), pdf_name), "wb") as f:
                resp = requests.get(pdf_url)
                f.write(resp.content)
            print(pdf_name + "文件已下载保存到文件夹 " + os.path.join(journal, str(vol), str(issue)))
            pdfpath = os.path.join(journal, str(vol), str(issue), pdf_name)
            try:
                subprocess.check_output(['gs', '-q', '-dNOPAUSE', '-dBATCH', '-sDEVICE=nullpage', pdfpath])
                print(pdf_name + ' can be opened')
            except subprocess.CalledProcessError:
                print(pdf_name + ' is damaged or corrupted')
                sys.exit()
            count += 1
        else:
            continue
        #time.sleep(5*60)
        sleep_time = random.randint(2, 10)
        time.sleep(sleep_time)
    # Check whether every PDF file was downloaded and saved
    if count == len(pdf_names):
        print('volume ' + str(vol) + ', issue ' + str(issue) + ' downloaded completely.')
    else:
        print('Files to download in total: ' + str(len(pdf_names)) + ', files downloaded: ' + str(count))

release v3

Strips illegal characters from file names; improves the storage layout and file output, and makes the script more robust.
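The sanitisation added in v3 replaces the characters \ / : * ? " < > | , which are not allowed in file names on common file systems, with a hyphen. A quick standalone example of what it does (the file name used here is made up):

import re

illegal_chars_regex = re.compile(r"[\\/:*?\"<>|]")
replacement_char = "-"
name = 'Smith_1_Overview:_H-mode/ELM_"studies".pdf'  # hypothetical file name
print(illegal_chars_regex.sub(replacement_char, name))
# prints: Smith_1_Overview-_H-mode-ELM_-studies-.pdf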

import os
import sys
import subprocess
import time
import requests
import random
from bs4 import BeautifulSoup
import re

# Regular expression matching characters that are illegal in file names
illegal_chars_regex = re.compile(r"[\\/:*?\"<>|]")

# Character used to replace the illegal ones
replacement_char = "-"


for issue in range(1, 13):

    baseurl = 'https://iopscience.iop.org'
    vol = 62
    isbn = '/0029-5515/'
    journal = 'NF'
    url = baseurl + '/issue' + isbn + str(vol) + '/' +str(issue)
    print(url)
    #url = 'https://iopscience.iop.org/issue/0029-5515/63/8'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    if soup.find('h2', string='We apologize for the inconvenience...'):
        # print(soup)
        print('Please unlock the website manually.')
        sys.exit()
    #meta_list = soup.select('.small.art-list-item-meta, .mr-2.nowrap')

    # Find all divs with class="art-list-item-body"
    art_list_item_body_divs = soup.find_all('div', {'class': 'art-list-item-body'})
    
    # Create empty lists to store values
    #indexer_list = []
    #authors_list = []
    #mr_2_nowrap_list = []
    pdf_names = []  # list of PDF file names
    pdf_urls = []   # list of PDF file URLs
    # Loop through each div and extract the values of class="indexer", "small art-list-item-meta", "mr-2 nowrap"
    for div in art_list_item_body_divs:
        indexer = div.find('div', {'class': 'indexer'})
        small_art_list_item_meta = div.find('p', {'class': 'small art-list-item-meta'})
        mr_2_nowrap = div.find('div', {'class': 'art-list-item-title-wrapper'})
        a = mr_2_nowrap.find('a', {'class': 'art-list-item-title'})
        href = a.get('href')
        if small_art_list_item_meta is not None: 
            authors = small_art_list_item_meta.get_text(strip=True)
            if ',' in authors:
                firstauthor = authors.split(',')[0].replace(' ', '')
            else:
                firstauthor = authors.strip().replace(' ', '')
        else:
            print('No small art-list-item-meta found in this art-list-item-body')
            firstauthor = 'None'
        indexer = indexer.text.strip()
        title = mr_2_nowrap.text.strip().replace(' ','_')
        pdf_name = firstauthor + '_' + indexer + '_' + title + '.pdf'
        # Replace illegal characters with the replacement character
        pdf_name = illegal_chars_regex.sub(replacement_char, pdf_name)
        pdf_url = baseurl + href + '/pdf'
        pdf_names.append(pdf_name)
        pdf_urls.append(pdf_url)
        print(pdf_name)
        print(pdf_url)
    # sys.exit()
    # Check that pdf_names and pdf_urls have the same length
    print(str(issue), str(len(pdf_names)), str(len(pdf_urls)))
    if(len(pdf_names) != len(pdf_urls)):
        print('The number of authors and urls is different in volume ' + str(vol) + ', issue ' + str(issue) + '.')
        continue
    print(str(issue))
    ## Create the output folder
    os.makedirs(os.path.join(journal, str(vol), str(issue)), exist_ok=True)
    ## Download the PDFs
    # Initialise the counter
    count = 0
    for pdf_url, pdf_name in zip(pdf_urls, pdf_names):
        print(pdf_url)
        #print(pdf_name)
        # Check whether the file already exists
        if not os.path.exists(os.path.join(journal, str(vol), str(issue), pdf_name)):
            with open(os.path.join(journal, str(vol), str(issue), pdf_name), "wb") as f:
                resp = requests.get(pdf_url)
                f.write(resp.content)
            print(pdf_name + "文件已下载保存到文件夹 " + os.path.join(journal, str(vol), str(issue)))
            pdfpath = os.path.join(journal, str(vol), str(issue), pdf_name)
            try:
                subprocess.check_output(['gs', '-q', '-dNOPAUSE', '-dBATCH', '-sDEVICE=nullpage', pdfpath])
                print(pdf_name + ' can be opened')
            except subprocess.CalledProcessError:
                print(pdf_name + ' is damaged or corrupted')
                htmlpath = os.path.join(journal, str(vol), str(issue), pdf_name +'.html')
                os.rename(pdfpath,htmlpath)
                sys.exit()
            count += 1
        else:
            continue
        #time.sleep(5*60)
        sleep_time = random.randint(300, 900)
        time.sleep(sleep_time)
    # Check whether every PDF file was downloaded and saved
    if count == len(pdf_names):
        print('volume ' + str(vol) + ', issue ' + str(issue) + ' downloaded completely.')
    else:
        print('Files to download in total: ' + str(len(pdf_names)) + ', files downloaded: ' + str(count))