import re
import argparse
from pathlib import Path
from typing import List, Dict, Optional, Set, Union
from dataclasses import dataclass
from collections import Counter
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage, LTTextBox, LTTextLine, LTChar, LAParams
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument


@dataclass
class TextStyle:
    """Style information attached to one extracted text element."""

    indent_level: int = 0   # index into the page's indent bases (0 = leftmost)
    heading_level: int = 0  # number of '#' to emit; 0 means "not a heading"
    font_size: float = 0    # size of the first sized character in the text box
    is_link: bool = False   # True when the text contains a URL-like substring


class PDFToMarkdownConverter:
    """Convert the text layer of a PDF into a Markdown document.

    Text boxes are read with pdfminer, classified as bullets, numbered list
    items, headings or plain paragraphs, and written out page by page.
    """

    # URL-like substrings: explicit http(s) URLs or bare "www." hosts.
    _URL_PATTERN = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')

    def __init__(self):
        self.bullet_patterns = [
            r'^[•・○●◆◇□△▲▽▼]\s+',
            r'^\s*[•・○●◆◇□△▲▽▼]\s+',
            # ASCII hyphen, full-width hyphen-minus (U+FF0D) and asterisk.
            # The hyphen must come first in the class: "[--*]" is parsed as a
            # reversed range and makes re raise "bad character range".
            r'^\s*[-\uFF0D*]\s+',
        ]
        # Every pattern captures the list marker in group 1 so that
        # _process_numbered_list can turn it into an ordinal.
        self.numbered_patterns = [
            r'^(\d+)[\.)]\s+',                     # "1." or "1)"
            r'^([\u2460-\u2473])\s+',              # circled numbers 1..20
            r'^([A-Za-z])[\.)]\s+',                # "a." or "A)"
            r'^[(\uFF08]\s*(\d+)\s*[)\uFF09]\s*',  # "(1)", ASCII or full-width parens
        ]
        self.base_font_size = 0
        self.error_count = Counter()
        self.is_powerpoint = False

    @staticmethod
    def _decode_pdf_string(value) -> str:
        """Decode a PDF metadata value into ``str``.

        PDF text strings are UTF-16 only when they start with a byte-order
        mark; otherwise they are single-byte (PDFDocEncoding, which agrees
        with Latin-1 on the ASCII range that matters here).
        """
        if isinstance(value, str):
            return value
        if isinstance(value, (bytes, bytearray)):
            raw = bytes(value)
            if raw.startswith((b'\xfe\xff', b'\xff\xfe')):
                return raw.decode('utf-16', errors='ignore')
            if raw.startswith(b'\xef\xbb\xbf'):
                return raw[3:].decode('utf-8', errors='ignore')
            return raw.decode('latin-1')
        return str(value)

    def _detect_pdf_source(self, pdf_path: str) -> None:
        """Set ``self.is_powerpoint`` from the PDF's ``Producer`` metadata.

        Any failure is counted under ``producer_detection_error`` and leaves
        ``is_powerpoint`` False; it never propagates.
        """
        # Reset first so a reused converter does not keep the previous verdict.
        self.is_powerpoint = False
        try:
            with open(pdf_path, 'rb') as file:
                parser = PDFParser(file)
                doc = PDFDocument(parser)
                if doc.info:
                    raw_producer = doc.info[0].get('Producer', b'')
                    producer = self._decode_pdf_string(raw_producer).lower().strip()
                    self.is_powerpoint = "powerpoint" in producer
                    print(f"Debug - Direct check 'powerpoint' in '{producer}': {self.is_powerpoint}")
        except Exception as e:
            print(f"Debug - Error: {str(e)}")
            self.error_count['producer_detection_error'] += 1
            self.is_powerpoint = False

    def convert(self, pdf_path: str, output_path: Union[str, Path]) -> Dict:
        """Convert ``pdf_path`` to Markdown and write it to ``output_path``.

        Args:
            pdf_path: Path of the PDF file to read.
            output_path: Path of the Markdown file to write (UTF-8).

        Returns:
            A stats dict with ``total_pages``, ``processed_elements``,
            ``source`` ('PowerPoint' or 'Other') and, when any page failed,
            ``errors`` mapping error message to occurrence count.

        Raises:
            RuntimeError: If the conversion as a whole fails (for example the
                file cannot be read or written). Per-page failures are
                recorded in ``errors`` instead of being raised.
        """
        markdown_content = []
        stats = {'total_pages': 0, 'processed_elements': 0}
        # Errors are reported per conversion, so start from a clean counter.
        self.error_count = Counter()

        try:
            # Decide which heading heuristics apply.
            self._detect_pdf_source(pdf_path)

            laparams = LAParams(
                line_margin=0.5,
                word_margin=0.1,
                char_margin=2.0,
                boxes_flow=0.5,
                detect_vertical=True,
            )
            self._detect_base_font_size(pdf_path, laparams)

            pages = extract_pages(pdf_path, laparams=laparams)
            for page_num, page in enumerate(pages, 1):
                # Count the page even if processing it fails below.
                stats['total_pages'] = page_num
                try:
                    markdown_content.append("\n")
                    elements = self._extract_elements(page)
                    stats['processed_elements'] += len(elements)
                    processed_text = self._process_elements(elements)
                    markdown_content.append(processed_text)
                except Exception as e:
                    # Best effort: skip the page but keep a record of why.
                    self.error_count[str(e)] += 1

            output_path = Path(output_path)
            output_path.write_text('\n'.join(markdown_content), encoding='utf-8')

            if self.error_count:
                stats['errors'] = dict(self.error_count)
            stats['source'] = 'PowerPoint' if self.is_powerpoint else 'Other'
            return stats
        except Exception as e:
            raise RuntimeError(f"PDF変換エラー: {str(e)}") from e

    def _detect_heading_level(self, text: str, font_size: float,
                              y_position: float = 0, page_height: float = 0) -> int:
        """Return the Markdown heading level for ``text`` (0 = not a heading).

        Textual patterns are checked first and apply to every document; the
        font-size heuristics differ between PowerPoint exports and other
        sources.
        """
        # Pattern-based detection; digits may be ASCII or full-width.
        if re.match(r"^第\s*[0-9\uFF10-\uFF19]+\s*章", text):
            return 1
        if re.match(r"^[0-9\uFF10-\uFF19]+\.[0-9\uFF10-\uFF19]+\s+", text):
            return 2

        # A bare one- or two-digit number (e.g. a page number) is never a heading.
        if re.match(r"^\d{1,2}$", text.strip()):
            return 0

        # Font-size detection needs both sizes to be known.
        if self.base_font_size <= 0 or font_size <= 0:
            return 0

        size_ratio = font_size / self.base_font_size

        if self.is_powerpoint:
            # Slide titles sit in the top quarter of the page and are rendered as "##".
            if y_position >= 0.75 * page_height and size_ratio >= 1.19:
                return 2
            # Very large text elsewhere on the slide is also rendered as "##".
            if size_ratio >= 2:
                return 2
        else:
            # Thresholds used for Word-like documents.
            if size_ratio >= 1.5:
                return 1
            if size_ratio >= 1.3:
                return 2
            if size_ratio >= 1.1:
                return 3

        return 0

    def _detect_base_font_size(self, pdf_path: str, laparams: LAParams) -> None:
        """Set ``self.base_font_size`` to the most common character size.

        Only the first two pages are sampled. The value is reset to 0 first,
        so it stays 0 when no characters are found.
        """
        self.base_font_size = 0
        size_counts = Counter()
        for page in extract_pages(pdf_path, maxpages=2, laparams=laparams):
            for element in page:
                if not isinstance(element, LTTextBox):
                    continue
                for text_line in element:
                    for char in text_line:
                        if isinstance(char, LTChar):
                            size_counts[char.size] += 1

        if size_counts:
            # The mode of the sampled sizes is taken as the body-text size.
            self.base_font_size = size_counts.most_common(1)[0][0]

    def _extract_elements(self, page: LTPage) -> List[Dict]:
        """Extract the text boxes of ``page`` with their style information.

        Returns:
            A list of ``{'text': str, 'style': TextStyle}`` dicts ordered top
            to bottom, then left to right.
        """
        text_boxes = [element for element in page if isinstance(element, LTTextBox)]
        indent_bases = self._calculate_indent_bases(
            sorted(box.x0 for box in text_boxes), page.width)

        # sorted() is stable, so boxes with equal keys keep page order and the
        # output is the same from run to run.
        sorted_boxes = sorted(text_boxes, key=lambda box: (-box.y1, box.x0))

        elements = []
        for textbox in sorted_boxes:
            text = textbox.get_text().strip()
            if not text:
                continue

            # Font size of the first character that reports a non-zero size.
            font_size = next(
                (char.size
                 for line in textbox
                 for char in line
                 if isinstance(char, LTChar) and char.size),
                0,
            )

            style = TextStyle(
                indent_level=self._calculate_indent_level(
                    textbox.x0, indent_bases, page.width),
                font_size=font_size,
                is_link=bool(self._URL_PATTERN.search(text)),
            )
            style.heading_level = self._detect_heading_level(
                text, font_size, textbox.y0, page.height)

            elements.append({'text': text, 'style': style})

        return elements

    def _calculate_indent_bases(self, x_positions: List[float], page_width: float) -> List[float]:
        """Cluster sorted left edges into indent bases.

        Consecutive positions closer than ``min(page_width * 0.015, 8)`` join
        the same cluster; each base is the mean of its cluster.
        """
        if not x_positions:
            return []

        tolerance = min(page_width * 0.015, 8)
        clusters = [[x_positions[0]]]
        for x in x_positions[1:]:
            if x - clusters[-1][-1] <= tolerance:
                clusters[-1].append(x)
            else:
                clusters.append([x])

        return [sum(cluster) / len(cluster) for cluster in clusters]

    def _calculate_indent_level(self, x_position: float, indent_bases: List[float],
                                page_width: float) -> int:
        """Return the index of the first indent base within tolerance, else 0."""
        if not indent_bases:
            return 0

        tolerance = min(page_width * 0.008, 4)
        for level, base in enumerate(indent_bases):
            if abs(x_position - base) <= tolerance:
                return level
        return 0

    def _is_bullet_point(self, text: str) -> bool:
        """Return True when ``text`` starts with a bullet marker."""
        return any(re.match(pattern, text) for pattern in self.bullet_patterns)

    def _is_numbered_list(self, text: str) -> bool:
        """Return True when ``text`` starts with a numbered-list marker."""
        return any(re.match(pattern, text) for pattern in self.numbered_patterns)

    def _process_bullet_point(self, text: str) -> str:
        """Strip the bullet marker and return a Markdown ``- item`` line."""
        for pattern in self.bullet_patterns:
            text = re.sub(pattern, '', text)
        return f"- {text.strip()}"

    def _process_numbered_list(self, text: str) -> str:
        """Rewrite a numbered item as a Markdown ``N. item`` line.

        Decimal markers (including full-width digits) keep their value,
        circled numbers map to 1..20 and single letters map to a=1, b=2, ...
        Text matching no pattern is returned unchanged.
        """
        for pattern in self.numbered_patterns:
            match = re.match(pattern, text)
            if not match:
                continue

            marker = match.group(1).strip() if match.groups() else ""
            if marker.isdecimal():
                # int() also normalises full-width digits to ASCII.
                number = int(marker)
            elif len(marker) == 1 and '\u2460' <= marker <= '\u2473':
                number = ord(marker) - 0x2460 + 1
            elif len(marker) == 1 and marker.isascii() and marker.isalpha():
                number = ord(marker.lower()) - ord('a') + 1
            else:
                # Custom pattern without a usable marker: fall back to 1.
                number = 1

            # Patterns are anchored at the start, so the marker is the prefix.
            return f"{number}. {text[match.end():].strip()}"
        return text

    @staticmethod
    def _wrap_url(match: "re.Match") -> str:
        """Return the matched URL as a Markdown autolink, adding a scheme if missing."""
        url = match.group(0)
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        return f'<{url}>'

    def _process_elements(self, elements: List[Dict]) -> str:
        """Render extracted elements as Markdown paragraphs.

        Args:
            elements: Dicts with ``'text'`` and ``'style'`` (a ``TextStyle``).

        Returns:
            The rendered elements joined by blank lines.
        """
        markdown_lines = []

        for element in elements:
            text = element['text']
            style = element['style']
            is_numbered = False

            if self._is_bullet_point(text):
                text = self._process_bullet_point(text)
            elif self._is_numbered_list(text):
                text = self._process_numbered_list(text)
                is_numbered = True
            elif style.heading_level > 0:
                text = '#' * style.heading_level + ' ' + text

            # Headings and list items are never indented.
            if (style.indent_level > 0 and not is_numbered
                    and not text.startswith(('#', '-'))):
                text = ' ' * style.indent_level + text

            if style.is_link:
                # A single substitution pass wraps each occurrence exactly
                # once, including bare "www." hosts.
                text = self._URL_PATTERN.sub(self._wrap_url, text)

            markdown_lines.append(text)

        return '\n\n'.join(markdown_lines)


def parse_arguments():
    """Parse the command-line arguments of the converter script."""
    parser = argparse.ArgumentParser(description='Convert various file formats to Markdown')
    parser.add_argument('input_file', help='Input file path')
    parser.add_argument('-o', '--output', help='Output markdown file path')
    parser.add_argument('--verbose', action='store_true', help='Enable verbose output')
    return parser.parse_args()


def main() -> None:
    """Command-line entry point: convert one PDF and optionally print stats."""
    args = parse_arguments()
    converter = PDFToMarkdownConverter()

    # Without -o/--output, derive the output name from the input file name.
    output_file = args.output or Path(args.input_file).with_suffix('.md')

    stats = converter.convert(args.input_file, output_file)

    # Print the statistics only when verbose output was requested.
    if args.verbose:
        print(f"変換統計: {stats}")


if __name__ == "__main__":
    main()