Source code for pdf_analyzer.api.visualize.word_cloud

from pdf_analyzer.uploaders.file_uploader import FileUploader
from pdf_analyzer.api.grobid_client_python.grobid_client.grobid_client import GrobidClient
from pdf_analyzer.api.API import BaseAPI
from pdf_analyzer.logger import logging
from bs4 import BeautifulSoup
from omegaconf import DictConfig
from wordcloud import WordCloud as WC
import matplotlib.pyplot as plt

import os
import time



class WordCloud(BaseAPI):
    """Create word clouds from the abstract text of processed XML files.

    Image size, background colour and optional on-disk caching of the
    rendered images are all read from ``api_config.image``.
    """

    def __init__(self, api_config: DictConfig, server_config: DictConfig):
        """Initialise the base API and expose its results as ``word_clouds``.

        Args:
            api_config: API configuration; ``process_file`` reads its
                ``image`` and ``grobid`` sections.
            server_config: Server configuration, forwarded unchanged to
                ``BaseAPI.__init__``.

        Raises:
            ValueError: If anything fails during initialisation. The
                original exception is logged and chained as the cause.
        """
        try:
            super().__init__(api_config, server_config)
            # NOTE(review): ``proccesed_files`` is presumably populated by
            # BaseAPI.__init__ with the results of ``process_file`` -- confirm.
            self.word_clouds = self.proccesed_files
        except Exception as e:
            logging.error("Error with WordCloud creation :" + str(e))
            raise ValueError(e) from e

    def process_file(self, file_path) -> WC:
        """Build a word cloud from the ``<abstract>`` paragraphs of one XML file.

        When ``api_config.image.cache`` is truthy the image is also written to
        ``<cache_dir>/<grobid.operation_key>/<file_name><image.format>``.

        Args:
            file_path: Path to the XML file to read.

        Returns:
            The generated word cloud object.

        NOTE(review): if the file has no abstract text, ``generate`` receives
        an empty string and is expected to raise -- confirm how callers
        should handle that case.
        """
        file_name = self.extract_file_name(file_path)

        # The context manager closes the handle even if parsing raises.
        with open(file_path, "r") as xml_file:
            soup = BeautifulSoup(xml_file, "xml")

        # Every paragraph is prefixed with a space so words from adjacent
        # paragraphs are never glued together.
        all_p_string = "".join(
            " " + p.text
            for abstract in soup.find_all("abstract")
            for p in abstract.find_all("p")
        )
        logging.info(
            "All paragraphs of the abtract of the xml_file " + file_name + " are joined"
        )

        image_config = self.api_config.image
        wordcloud = WC(
            width=image_config.width,
            height=image_config.height,
            background_color=image_config.background,
        ).generate(all_p_string)
        logging.info("WordCloud of the file " + file_name + " is created")

        if image_config.cache:
            file_dir = os.path.join(
                os.path.abspath(image_config.cache_dir),
                self.api_config.grobid.operation_key,
            )
            # exist_ok avoids the check-then-create race of a manual
            # os.path.exists() test.
            os.makedirs(file_dir, exist_ok=True)
            # A separate name keeps the input path and output path distinct.
            image_path = os.path.join(file_dir, file_name + image_config.format)
            wordcloud.to_file(image_path)
            logging.info("WordCloud png is store at " + image_path)

        return wordcloud

    def get_len(self) -> int:
        """Return the number of word clouds held in ``self.word_clouds``."""
        return len(self.word_clouds)

    def plot_cloud(self, cloud):
        """Display a single word cloud with matplotlib.

        Failures are logged rather than raised, so one bad cloud does not
        interrupt the caller.

        Args:
            cloud: The word cloud (or any image accepted by ``plt.imshow``).
        """
        try:
            plt.imshow(cloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
        except Exception as e:
            # Build one message string: passing the exception as an extra
            # positional argument without a %s placeholder breaks formatting.
            logging.error(f"Cloud image could not be shown: {e}")

    def show_all_cloud(self):
        """Display every word cloud in ``self.word_clouds``, one after another."""
        for index, cloud in enumerate(self.word_clouds):
            try:
                self.plot_cloud(cloud)
            except Exception as e:
                # ``index`` is an int, so it is formatted into the message
                # instead of being concatenated to a str.
                logging.error(f"Cloud with index {index} could not be shown: {e}")