Spaces:

asach
/

arxiv-plagiarism-checker-Ilm

Runtime error

App Files Files Community

gamingflexer committed on Feb 1, 2024

Commit

9269cc3

1 Parent(s): eb08878

Pdf Text extraction

Browse files

Files changed (1) hide show

src/plagiarism/preprocessing.py +142 -0

src/plagiarism/preprocessing.py CHANGED Viewed

@@ -1,5 +1,147 @@
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 def remove_numbers(words_list: list) -> list:

 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
+import time
+from operator import itemgetter
+import fitz
+import re
def fonts(doc, granularity=False):
    """Collect the font styles used in a PDF and count how often each occurs.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: when true, spans are told apart by size, flags, font
        and color; otherwise by size alone
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: style identifiers sorted by descending span count, and a mapping
        from each identifier to the style of the last span seen with it
    :raises ValueError: if no text span is found in the document
    """
    styles = {}
    usage = {}
    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks contain text
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if granularity:
                        key = "{0}_{1}_{2}_{3}".format(
                            span['size'], span['flags'], span['font'], span['color'])
                        style = {'size': span['size'], 'flags': span['flags'],
                                 'font': span['font'], 'color': span['color']}
                    else:
                        key = "{0}".format(span['size'])
                        style = {'size': span['size'], 'font': span['font']}
                    styles[key] = style
                    usage[key] = usage.get(key, 0) + 1  # one more span in this style

    # Most frequent style first; ties keep first-seen order (stable sort).
    ranked = sorted(usage.items(), key=itemgetter(1), reverse=True)
    if not ranked:
        raise ValueError("Zero discriminating fonts found!")
    return ranked, styles
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The most used style is treated as the paragraph style and tagged '<p>'.
    Larger sizes are tagged '<h1>', '<h2>', ... from largest to smallest;
    smaller sizes are tagged '<s1>', '<s2>', ... from largest to smallest.

    :param font_counts: (identifier, count) for all styles occurring in the
        document, most used first
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_size = styles[font_counts[0][0]]['size']  # size of the most used style (paragraph)

    def _size_of(identifier):
        # Prefer the recorded style: a granular identifier such as
        # "size_flags_font_color" cannot be parsed with float().
        style = styles.get(identifier)
        if style is not None and 'size' in style:
            return style['size']
        return float(identifier)

    # Distinct sizes, high to low, so the right integer is appended to each
    # tag. De-duplicating matters when several styles share one size.
    font_sizes = sorted({_size_of(identifier) for identifier, _ in font_counts},
                        reverse=True)

    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)
    return size_tag
def headers_para(doc, size_tag):
    """Scrapes paragraph texts from a PDF, using font size to skip non-paragraph spans.

    Only spans whose size maps to '<p>' in ``size_tag`` contribute text.
    Consecutive paragraph spans of the same size inside one block are joined
    with a single space. Text left at the end of a block is kept only when it
    is longer than 80 characters; text closed off by a size change inside a
    block is kept regardless of length.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: paragraph texts in document order
    :raises KeyError: if a span's size is missing from ``size_tag``
    """
    paragraphs = []  # list with paragraphs
    previous_s = None  # previous non-blank span, carried across blocks and pages
    for page in doc:
        for b in page.get_text("dict")["blocks"]:  # iterate through the text blocks
            if b['type'] != 0:  # only type-0 blocks contain text
                continue
            # REMEMBER: multiple fonts and sizes are possible IN one block
            block_string = ""  # paragraph text collected in this block so far
            for line in b["lines"]:  # iterate through the text lines
                for s in line["spans"]:  # iterate through the text spans
                    if not s['text'].strip():  # skip whitespace-only spans
                        continue
                    if previous_s is not None and s['size'] == previous_s['size']:
                        if block_string:  # same run of text, so concatenate
                            block_string += " " + s['text']
                        elif size_tag[s['size']] == '<p>':
                            # A new block starts with the same size as the
                            # previous block's last span: begin a fresh
                            # paragraph instead of dropping the text.
                            block_string = s['text']
                    else:
                        if block_string:  # size changed, so close the running paragraph
                            paragraphs.append(block_string)
                        block_string = s['text'] if size_tag[s['size']] == '<p>' else ''
                    previous_s = s
            if len(block_string) > 80:  # keep the block's trailing paragraph only if long enough
                paragraphs.append(block_string)
    return paragraphs
def get_pdf_info(document_path):
    """Extracts the cleaned full text and the long paragraphs of a PDF.

    :param document_path: path of the PDF file, passed to ``fitz.open``
    :rtype: (str, list)
    :return: the lower-cased text of all pages with number-only line endings
        removed, and the lower-cased paragraphs longer than 100 characters
    :raises ValueError: propagated from ``fonts`` when the document has no
        text spans
    """
    # Use the document as a context manager so it is closed even when one of
    # the extraction steps raises.
    with fitz.open(document_path) as docs:
        only_text = "".join(page.get_text() + " " for page in docs)
        font_counts, styles = fonts(docs, granularity=False)
        size_tag = font_tags(font_counts, styles)
        elements = headers_para(docs, size_tag)

    paragraphs = [element.lower() for element in elements if len(element) > 100]

    # Remove a run of digits (optionally with a decimal part) that is directly
    # followed by a newline; presumably page or section numbers -- TODO confirm.
    pattern = r'\d+(\.\d+)?\n'
    cleaned_text = re.sub(pattern, '', only_text)
    return cleaned_text.lower(), paragraphs
 def remove_numbers(words_list: list) -> list: