forked from northy/pdf-to-markdown-workflow
-
Notifications
You must be signed in to change notification settings - Fork 0
/
split.py
45 lines (37 loc) · 1.06 KB
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import fitz, sys, os
# Render every page of a PDF to a PNG and splice Markdown image links for
# those PNGs between the PDF-TO-MARKDOWN START/END markers of a Markdown file.
#
# Positional command-line arguments (all required):
#   1. pdf_input      - path of the PDF to render
#   2. output_folder  - image folder exactly as it should appear in the
#                       generated Markdown links
#   3. source_md      - Markdown file that is read (expected to hold the markers)
#   4. output_md      - Markdown file the result is written to
#   5. root prefix    - string prepended verbatim to output_folder to form the
#                       on-disk directory the PNGs are saved in
#   6. line_end       - text appended after each image line; a literal
#                       backslash-n sequence in it is turned into a newline
if len(sys.argv) < 7:
    sys.exit(
        "usage: split.py PDF_INPUT OUTPUT_FOLDER SOURCE_MD OUTPUT_MD "
        "OUTPUT_ROOT_PREFIX LINE_END"
    )

pdf_input = sys.argv[1]
output_folder = sys.argv[2]
source_md = sys.argv[3]
output_md = sys.argv[4]
# Plain string concatenation on purpose: the caller decides whether the
# prefix ends with a path separator.
output_folder_root = sys.argv[5] + output_folder
line_end = sys.argv[6].replace('\\n', '\n')

# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(output_folder_root, exist_ok=True)

with open(source_md, 'r', encoding='utf-8') as source:
    lines = source.readlines()

# Locate the marker lines. If a marker occurs more than once the last
# occurrence wins; if a marker is absent its index stays 0.
s = 0
e = 0
for i, line in enumerate(lines):
    if "<!-- PDF-TO-MARKDOWN:START -->" in line:
        s = i
    if "<!-- PDF-TO-MARKDOWN:END -->" in line:
        e = i

# Drop whatever currently sits strictly between the two markers. When END is
# not after START the slice is empty and nothing is removed.
del lines[s + 1:e]

imgs = []
doc = fitz.open(pdf_input)
try:
    # Iterate over the real page count instead of looping until load_page
    # raises: a render or save failure must surface as an error rather than
    # be mistaken for "no more pages" and silently truncate the output.
    for i in range(1, len(doc) + 1):
        page = doc.load_page(i - 1)
        pix = page.get_pixmap()
        output = f"{output_folder}/page{i}.png"
        output_root = f"{output_folder_root}/page{i}.png"
        # Save first so a link is only emitted for an image that exists.
        pix.save(output_root)
        imgs.append(f'![Page {i}]({output} "Page {i}")\n{line_end}')
finally:
    doc.close()

# Insert the generated image lines right after the START marker line.
lines[s + 1:s + 1] = imgs

with open(output_md, 'w', encoding='utf-8') as md:
    md.writelines(lines)