Source code for pdf_analyzer.uploaders.file_uploader


from pdf_analyzer.logger import logging
import os
import glob
from bs4 import BeautifulSoup
from pdf_analyzer.api.grobid_client_python.grobid_client.grobid_client import GrobidClient
from typing import Callable, Dict, List, Optional, Union,IO

     

[docs]class FileUploader(): def __init__(self,data_dir:str): """_summary_ Initialize file_uploader, it will use the data_dir for file searching Args: data_dir (str): _description_. folder for searching files. """ self.data_dir = data_dir
[docs] def get_all_files_with_format(self,file_format:str,recursive_search=False): """_summary_ get all files with the specific format from data_dir , if recursive is set to True it will search all folder at the folder Args: file_format (str): _description_ recursive_search (bool, optional): _description_. Defaults to False. Raises: ValueError: _description_ Returns: _type_: _description_ """ if not os.path.exists(self.data_dir): os.makedirs(self.data_dir) if not os.path.exists(self.data_dir) or not os.path.isdir(self.data_dir): error_messege = "data_dir: "+self.data_dir+" is not a valid or existing directory" logging.error(error_messege) raise ValueError(error_messege) pattern = os.path.join(self.data_dir, f"**/*{file_format}") file_paths = glob.glob(pattern, recursive=recursive_search) return file_paths