# NOTE(review): the next three lines are shell commands, not Python; they look
# like notes pasted above the script and must be run from a shell (left in a
# .py file they are a syntax error) -- confirm whether they belong here.
# Delete any previous log; -f keeps rm quiet when /tmp/link does not exist.
rm -f /tmp/link
# Check the site's links, showing the output and appending it to /tmp/link via tee -a.
# (--no-warnings presumably hides linkchecker warnings -- confirm against its manual.)
linkchecker --no-warnings https://www.kkaneko.jp/ | tee -a /tmp/link
# Second run with --file-output text; presumably this also writes a text report
# file chosen by linkchecker -- TODO confirm the output file name in its manual.
linkchecker --file-output text --no-warnings https://www.kkaneko.jp/


import sys
import os
import shutil
import io
from html.parser import HTMLParser
import chardet

def detect_encoding(filename, default='utf-8'):
    """Guess the text encoding of *filename* with chardet.

    Args:
        filename: Path of the file to inspect; it is read fully in binary mode.
        default: Encoding name returned when chardet cannot decide (for
            example for an empty file, where it reports ``None``).

    Returns:
        The encoding name reported by ``chardet.detect``, or *default* when
        no encoding was detected.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filename, 'rb') as file:
        raw = file.read()
    # chardet reports {'encoding': None, ...} for undetectable input; passing
    # None on to open() would silently select the locale-dependent encoding.
    return chardet.detect(raw)['encoding'] or default

class HTMLChecker(HTMLParser):
    """Find unbalanced structural tags and links nested inside headings.

    Only the tags named in ``check_tags`` take part in the balance check.
    Start tags are pushed on ``tags_stack``; an end tag either pops the
    matching top of the stack or is recorded as an unexpected closing tag.
    ``<a>`` start tags seen between a heading's start and end tag are
    recorded in ``links_in_headings`` as ``(heading, line, column)``.

    All positions come from ``HTMLParser.getpos()``: lines are 1-based and
    columns are 0-based.  ``position`` values are 0-based character offsets
    into ``content`` (everything fed so far).
    """

    # Heading elements inside which an <a> start tag is reported.
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5')

    def __init__(self):
        self.check_tags = {'html', 'head', 'body', 'h1', 'h2', 'h3', 'h4', 'h5', 'pre', 'div', 'ul', 'ol'}
        # HTMLParser.__init__ calls reset(), which sets up the per-document state.
        super().__init__()

    def reset(self):
        """Reset the parser and forget everything learned about the document."""
        super().reset()
        self.tags_stack = []
        self.unmatched_tags = []
        self.line_number = 1
        self.column_number = 0
        self.position = 0
        self.content = ""
        self.in_heading = False
        self.current_heading = ""
        self.links_in_headings = []
        # Offset in ``content`` at which each line starts (index 0 is line 1).
        self._line_starts = [0]

    def feed(self, data):
        """Feed *data* to the parser, keeping a copy for later verification.

        The text is accumulated (not replaced) because getpos() keeps
        counting lines across successive feed() calls.
        """
        base = len(self.content)
        self.content += data
        self._line_starts.extend(
            base + index + 1 for index, char in enumerate(data) if char == '\n'
        )
        super().feed(data)

    def update_position(self, data):
        """Advance the public position counters past the text *data*.

        Kept for backward compatibility; the parser itself now takes its
        positions from getpos() (see ``_sync_position``).
        """
        lines = data.split('\n')
        if len(lines) > 1:
            self.line_number += len(lines) - 1
            self.column_number = len(lines[-1])
        else:
            self.column_number += len(data)
        self.position += len(data)

    def _sync_position(self):
        """Copy the parser's current position into the public counters.

        Returns:
            ``(line, column, offset)`` of the construct being handled.
        """
        line, col = self.getpos()
        pos = self._line_starts[line - 1] + col
        self.line_number, self.column_number, self.position = line, col, pos
        return line, col, pos

    def handle_starttag(self, tag, attrs):
        """Record checked start tags and links that occur inside a heading."""
        line, col, pos = self._sync_position()
        tag = tag.lower()
        if tag in self.check_tags:
            self.tags_stack.append((tag, line, col, pos))

        if tag in self.heading_tags:
            self.in_heading = True
            self.current_heading = tag
        elif self.in_heading and tag == 'a':
            self.links_in_headings.append((self.current_heading, line, col))

    def handle_endtag(self, tag):
        """Pop the matching start tag or record an unexpected closing tag."""
        line, col, pos = self._sync_position()
        tag = tag.lower()
        if tag in self.check_tags:
            if self.tags_stack and self.tags_stack[-1][0] == tag:
                self.tags_stack.pop()
            else:
                closing = f"</{tag}>"
                # Same 6-field shape as the "Unclosed tag" entries so callers
                # can unpack every report uniformly; the end column assumes
                # the canonical spelling of the closing tag.
                self.unmatched_tags.append(
                    (f"Unexpected closing tag {closing}", line, col, pos, line, col + len(closing))
                )

        if tag in self.heading_tags:
            self.in_heading = False
            self.current_heading = ""

    def check_unmatched(self):
        """Return every problem found so far, ordered by position.

        Returns:
            A list of ``(message, start_line, start_col, start_pos,
            end_line, end_col)`` tuples.  Tags still open are reported as
            ending at the parser's current position.  Calling this method
            repeatedly does not duplicate entries.
        """
        end_line, end_col = self.getpos()
        unclosed = [
            (f"Unclosed tag <{tag}>", start_line, start_col, start_pos, end_line, end_col)
            for tag, start_line, start_col, start_pos in self.tags_stack
        ]
        return sorted(self.unmatched_tags + unclosed, key=lambda x: (x[1], x[2]))

    def verify_tag_at_position(self, tag, line, col, pos):
        """Tell whether ``</tag>`` is spelled exactly at *line*/*col*.

        Args:
            tag: Tag name without brackets, e.g. ``"ul"``.
            line: 1-based line number in the fed content.
            col: 0-based column in that line.
            pos: Absolute offset; accepted for symmetry with the report
                tuples but not needed for the check.

        Returns:
            True when the text at that position equals ``</tag>`` ignoring
            case; False otherwise, including out-of-range positions.
        """
        lines = self.content.split('\n')
        if col < 0 or not 0 <= line - 1 < len(lines):
            return False
        expected_tag = f"</{tag}>".lower()
        # Lower-case only the candidate slice: lower-casing the whole line can
        # change its length for some characters and shift the column.
        candidate = lines[line - 1][col:col + len(expected_tag)]
        return candidate.lower() == expected_tag

# Report messages (lower-cased) whose closing tag may be stripped automatically.
_REMOVABLE_CLOSING_TAGS = {
    "unexpected closing tag </ul>": "ul",
    "unexpected closing tag </ol>": "ol",
}


def _strip_spans(lines, removals):
    """Delete ``(line, col, length)`` spans from *lines* in place."""
    # Work from the end backwards so that removing one span never shifts the
    # column of another span still waiting on the same line.
    for line_no, col, length in sorted(removals, reverse=True):
        text = lines[line_no - 1]
        lines[line_no - 1] = text[:col] + text[col + length:]


def _backup_and_write(filename, backup_dir, text, encoding):
    """Copy *filename* into *backup_dir*, then overwrite it with *text*."""
    backup_path = os.path.join(backup_dir, os.path.basename(filename))
    try:
        shutil.copy2(filename, backup_path)
        print(f"Backup created at {backup_path}")
    except IOError as e:
        print(f"Error: Unable to create backup for file '{filename}'. {str(e)}")
        sys.exit(1)

    try:
        # newline='' writes the text verbatim, preserving the file's own
        # line endings instead of normalising them.
        with io.open(filename, 'w', encoding=encoding, newline='') as file:
            file.write(text)
        print(f"File {filename} has been updated.")
    except IOError as e:
        print(f"Error: Unable to write to file '{filename}'. {str(e)}")
        sys.exit(1)


def check_html_file(filename, backup_dir='/tmp'):
    """Check one HTML file and strip stray ``</ul>`` / ``</ol>`` closing tags.

    Prints every link found inside a heading and every unmatched tag
    reported by ``HTMLChecker``.  Unexpected ``</ul>`` and ``</ol>`` closing
    tags whose text is confirmed at the reported position are removed; the
    original file is first copied into *backup_dir*.

    Args:
        filename: Path of the HTML file to check (rewritten in place when a
            tag is removed).
        backup_dir: Directory that receives the backup copy.

    Raises:
        SystemExit: With status 1 when the file cannot be read, decoded,
            parsed, backed up or written.
    """
    try:
        encoding = detect_encoding(filename)
        # newline='' keeps CRLF/LF exactly as stored so a rewrite does not
        # silently convert the file's line endings.
        with io.open(filename, 'r', encoding=encoding, newline='') as file:
            content = file.read()
    except (IOError, UnicodeError, LookupError) as e:
        # UnicodeError: wrong guess by the detector; LookupError: unknown codec name.
        print(f"Error: Unable to read file '{filename}'. {str(e)}")
        sys.exit(1)

    checker = HTMLChecker()
    try:
        checker.feed(content)
        checker.close()  # flush anything the parser is still buffering
    except Exception as e:
        print(f"Error parsing HTML in file '{filename}': {str(e)}")
        sys.exit(1)

    unmatched = checker.check_unmatched()
    full_path = os.path.abspath(filename)

    if checker.links_in_headings:
        print(f"\nLinks found in headings in file '{full_path}':")
        for heading, line, col in checker.links_in_headings:
            print(f"  <{heading}> tag contains a link at line {line}, column {col}")

    if not unmatched:
        return

    removals = []
    for entry in unmatched:
        # Entries normally carry six fields; tolerate ones without an end
        # position by reusing the start position.
        error, start_line, start_col, start_pos, *end = entry
        end_line, end_col = end if len(end) == 2 else (start_line, start_col)
        print(f"{full_path}: {error} started at line {start_line}, column {start_col} and ended at line {end_line}, column {end_col}")

        tag_to_remove = _REMOVABLE_CLOSING_TAGS.get(error.lower())
        if tag_to_remove is None:
            continue

        closing = f"</{tag_to_remove}>"
        if checker.verify_tag_at_position(tag_to_remove, start_line, start_col, start_pos):
            removals.append((start_line, start_col, len(closing)))
            print(f"Removed unexpected {closing} tag (case insensitive) at line {start_line}, column {start_col}")
        else:
            print(f"Warning: Expected {closing} (case insensitive) at line {start_line}, column {start_col}, but it was not found or in incorrect format.")

    if removals:
        lines = content.split('\n')
        _strip_spans(lines, removals)
        _backup_and_write(filename, backup_dir, '\n'.join(lines), encoding)

if __name__ == "__main__":
    # Exactly one command-line argument is expected: the HTML file to check.
    if len(sys.argv) != 2:
        print("Usage: python html_checker.py <filename>")
        sys.exit(1)
    check_html_file(sys.argv[1])