-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdfdoc.py
More file actions
105 lines (82 loc) · 3.66 KB
/
pdfdoc.py
File metadata and controls
105 lines (82 loc) · 3.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import spacy as sp
from spacy_layout import spaCyLayout
import pandas as pd
import streamlit as st
import os
import json
import re
import utility as utils
MIN_SPAN_SIZE = 100
class PDFdoc():
    """Wraps an uploaded PDF: parses its layout with spaCy Layout, cleans the
    extracted paragraph text (URLs, HTML tags, citations), chunks it by word
    count, and persists the chunks to a JSON file.
    """

    def __init__(self, fname) -> None:
        """Parse the uploaded PDF's layout and initialize page bounds.

        NOTE(review): the original hint said `fname: str`, but the object is
        `.read()` here and its `.name` attribute is used in `__save_df`, so it
        is a file-like upload (e.g. a Streamlit UploadedFile) — TODO confirm
        against the caller.
        """
        self.fname = fname
        # Filled in later by __create_doc_list with the chunked paragraphs.
        self.chunk_txt = []
        nlp = sp.blank("en")
        st.toast("Creating SpaCy Layout")
        layout = spaCyLayout(nlp)
        # Parsing the layout of the PDF file can take some time (5-10 minutes)
        st.toast("Parsing Document Layout")
        self.doc = layout(self.fname.read())
        # Pages are presented to the user 1-based.
        self.first_process_page = 1
        self.last_process_page = len(self.doc._.layout.pages)
        if 'json_file' not in st.session_state:
            st.session_state.json_file = None

    ## PRIVATE METHODS
    def __remove_url(self, para_list: list) -> None:
        """Strip URLs (with or without a scheme) from each paragraph in place."""
        pattern = r"((http|ftp|https):\/\/)?([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])?"
        for index, para in enumerate(para_list):
            para_list[index] = re.sub(pattern, "", para)

    def __remove_html(self, para_list: list) -> None:
        """Strip HTML/XML tags from each paragraph in place."""
        pattern = r"<[^>]+>"
        for index, para in enumerate(para_list):
            para_list[index] = re.sub(pattern, "", para)

    # BUG FIX: return annotation was `-> str`; like its siblings, this method
    # mutates `para_list` in place and returns nothing.
    def __remove_cit(self, para_list: list) -> None:
        """Strip common academic citation styles from each paragraph in place."""
        for index, para in enumerate(para_list):
            # Remove citations like (Author, year) or (Author et al., year)
            para = re.sub(r"\s\([A-Z][a-z]+,\s[A-Z][a-z]?\.[^\)]*,\s\d{4}\)", "", para)
            # Remove bracketed numerical citations like [1], [2, 3], or [4-6]
            para = re.sub(r'\[\s*\d+(?:,\s*\d+)*(?:-\s*\d+)?\s*\]', '', para)
            # Remove citations like Author (Year) or Author et al. (year)
            para = re.sub(r'\s[A-Z][a-zA-Z]*(?: et al\.)? \(\d{4}\)', '', para)
            para_list[index] = para

    def __save_df(self) -> None:
        """Write the chunked paragraphs to `<fname>.json` and record the path
        in Streamlit session state."""
        json_file = utils.get_fname(self.fname.name) + ".json"
        st.session_state.json_file = json_file
        # BUG FIX: construct the named column directly. The original built an
        # unnamed DataFrame and then assigned `df.columns = ['paragraphs']`,
        # which raises ValueError when `chunk_txt` is empty (zero columns).
        df = pd.DataFrame({'paragraphs': self.chunk_txt})
        # Convert DataFrame to a dictionary with lists as values
        df_dict = df.to_dict(orient='list')
        # Opening with mode 'w' truncates, so no need to pre-delete the file.
        with open(json_file, 'w') as f:
            json.dump(df_dict, f, indent=2)  # indent for pretty-printing

    def __create_doc_list(self, first: int, last: int) -> None:
        """Extract, clean, and chunk paragraph text from pages `first`..`last`
        (1-based, inclusive), then persist the result via `__save_df`."""
        doc_list = []
        for i in utils.Interval(first, last):
            # Page count is zero-based
            page = self.doc._.pages[i - 1]
            # Keep only paragraph spans that exceed the minimum span length
            for section in page[1]:
                if section.label_ == "text" and len(section.text) > MIN_SPAN_SIZE:
                    doc_list.append(section.text)
        # Clean text of URL's, HTML and citations
        self.__remove_url(doc_list)
        self.__remove_html(doc_list)
        self.__remove_cit(doc_list)
        # Chunk the text so no text span exceeds 256 words
        self.chunk_txt = utils.chunk_text_by_words(doc_list)
        # Save the word list as a JSON file
        self.__save_df()

    # PUBLIC METHODS
    def get_first_page(self) -> int:
        """Return the first processable page number (always 1)."""
        return self.first_process_page

    def get_last_page(self) -> int:
        """Return the number of pages in the parsed document."""
        return self.last_process_page

    def process_pdf(self, first: int, last: int) -> None:
        """Process pages `first`..`last`: extract, clean, chunk, and save."""
        self.__create_doc_list(first, last)