-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdfdoc.py
More file actions
105 lines (82 loc) · 3.66 KB
/
pdfdoc.py
File metadata and controls
105 lines (82 loc) · 3.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import spacy as sp
from spacy_layout import spaCyLayout
import pandas as pd
import streamlit as st
import os
import json
import re
import utility as utils
MIN_SPAN_SIZE = 100
class PDFdoc():
    """Wraps an uploaded PDF: parses its layout with spaCy Layout, cleans the
    extracted paragraph text (URLs, HTML tags, citations), chunks it by word
    count, and persists the chunks to a JSON file.
    """

    def __init__(self, fname) -> None:
        """Parse the uploaded PDF's layout and initialize page bounds.

        NOTE(review): the original hint said `fname: str`, but the object is
        `.read()` here and its `.name` attribute is used in `__save_df`, so it
        is a file-like upload (e.g. a Streamlit UploadedFile) — TODO confirm
        against the caller.
        """
        self.fname = fname
        # Filled in later by __create_doc_list with the chunked paragraphs.
        self.chunk_txt = []
        nlp = sp.blank("en")
        st.toast("Creating SpaCy Layout")
        layout = spaCyLayout(nlp)
        # Parsing the layout of the PDF file can take some time (5-10 minutes)
        st.toast("Parsing Document Layout")
        self.doc = layout(self.fname.read())
        # Pages are presented to the user 1-based.
        self.first_process_page = 1
        self.last_process_page = len(self.doc._.layout.pages)
        if 'json_file' not in st.session_state:
            st.session_state.json_file = None

    ## PRIVATE METHODS
    def __remove_url(self, para_list: list) -> None:
        """Strip URLs (with or without a scheme) from each paragraph in place."""
        pattern = r"((http|ftp|https):\/\/)?([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])?"
        for index, para in enumerate(para_list):
            para_list[index] = re.sub(pattern, "", para)

    def __remove_html(self, para_list: list) -> None:
        """Strip HTML/XML tags from each paragraph in place."""
        pattern = r"<[^>]+>"
        for index, para in enumerate(para_list):
            para_list[index] = re.sub(pattern, "", para)

    # BUG FIX: return annotation was `-> str`; like its siblings, this method
    # mutates `para_list` in place and returns nothing.
    def __remove_cit(self, para_list: list) -> None:
        """Strip common academic citation styles from each paragraph in place."""
        for index, para in enumerate(para_list):
            # Remove citations like (Author, year) or (Author et al., year)
            para = re.sub(r"\s\([A-Z][a-z]+,\s[A-Z][a-z]?\.[^\)]*,\s\d{4}\)", "", para)
            # Remove bracketed numerical citations like [1], [2, 3], or [4-6]
            para = re.sub(r'\[\s*\d+(?:,\s*\d+)*(?:-\s*\d+)?\s*\]', '', para)
            # Remove citations like Author (Year) or Author et al. (year)
            para = re.sub(r'\s[A-Z][a-zA-Z]*(?: et al\.)? \(\d{4}\)', '', para)
            para_list[index] = para

    def __save_df(self) -> None:
        """Write the chunked paragraphs to `<fname>.json` and record the path
        in Streamlit session state."""
        json_file = utils.get_fname(self.fname.name) + ".json"
        st.session_state.json_file = json_file
        # BUG FIX: construct the named column directly. The original built an
        # unnamed DataFrame and then assigned `df.columns = ['paragraphs']`,
        # which raises ValueError when `chunk_txt` is empty (zero columns).
        df = pd.DataFrame({'paragraphs': self.chunk_txt})
        # Convert DataFrame to a dictionary with lists as values
        df_dict = df.to_dict(orient='list')
        # Opening with mode 'w' truncates, so no need to pre-delete the file.
        with open(json_file, 'w') as f:
            json.dump(df_dict, f, indent=2)  # indent for pretty-printing

    def __create_doc_list(self, first: int, last: int) -> None:
        """Extract, clean, and chunk paragraph text from pages `first`..`last`
        (1-based, inclusive), then persist the result via `__save_df`."""
        doc_list = []
        for i in utils.Interval(first, last):
            # Page count is zero-based
            page = self.doc._.pages[i - 1]
            # Keep only paragraph spans that exceed the minimum span length
            for section in page[1]:
                if section.label_ == "text" and len(section.text) > MIN_SPAN_SIZE:
                    doc_list.append(section.text)
        # Clean text of URL's, HTML and citations
        self.__remove_url(doc_list)
        self.__remove_html(doc_list)
        self.__remove_cit(doc_list)
        # Chunk the text so no text span exceeds 256 words
        self.chunk_txt = utils.chunk_text_by_words(doc_list)
        # Save the word list as a JSON file
        self.__save_df()

    # PUBLIC METHODS
    def get_first_page(self) -> int:
        """Return the first processable page number (always 1)."""
        return self.first_process_page

    def get_last_page(self) -> int:
        """Return the number of pages in the parsed document."""
        return self.last_process_page

    def process_pdf(self, first: int, last: int) -> None:
        """Process pages `first`..`last`: extract, clean, chunk, and save."""
        self.__create_doc_list(first, last)