import sys
import os
import shutil
import io
from html.parser import HTMLParser
import chardet
def detect_encoding(filename):
    """Guess the text encoding of *filename* from its raw bytes.

    The whole file is read in binary mode and handed to ``chardet.detect``.

    Args:
        filename: Path of the file to inspect.

    Returns:
        The encoding name reported by chardet, or ``'utf-8'`` when chardet
        reports no encoding (it yields ``None`` for input it cannot
        classify, such as an empty file).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filename, 'rb') as file:
        raw = file.read()
    # Callers pass the result straight to open(); a None would silently fall
    # back to the platform's locale encoding, so return a concrete default.
    return chardet.detect(raw)['encoding'] or 'utf-8'
class HTMLChecker(HTMLParser):
    """HTML parser that flags unbalanced structural tags and links in headings.

    Only tags named in ``check_tags`` are balanced against each other; every
    other tag is ignored for matching.  An ``<a>`` start tag seen while an
    ``h1``-``h5`` element is open is recorded in ``links_in_headings``.

    ``line_number``, ``column_number`` and ``position`` are counters kept by
    hand from the text of each construct the parser reports.  They are
    approximate: the parser hands over character references already decoded
    and does not expose the raw text of end tags, so unusual markup (for
    example ``&amp;`` or ``</ul >``) makes them drift from the true offset.
    Tag positions stored in ``tags_stack`` and ``unmatched_tags`` use the
    parser's own ``getpos()`` for line and column and are not affected.
    """

    # Heading elements inside which links are reported.
    HEADING_TAGS = frozenset({'h1', 'h2', 'h3', 'h4', 'h5'})

    def __init__(self):
        super().__init__()
        # Currently open checked tags as (tag, line, col, offset).
        self.tags_stack = []
        # Unexpected closing tags as
        # (message, line, col, offset, end_line, end_col).
        self.unmatched_tags = []
        self.check_tags = {'html', 'head', 'body', 'h1', 'h2', 'h3', 'h4', 'h5', 'pre', 'div', 'ul', 'ol'}
        # Hand-maintained counters: 1-based line, 0-based column, and the
        # number of characters consumed so far.
        self.line_number = 1
        self.column_number = 0
        self.position = 0
        # Text most recently passed to feed(); read by verify_tag_at_position.
        self.content = ""
        self.in_heading = False
        self.current_heading = ""
        # (heading tag, line, col) for each <a> found inside a heading.
        self.links_in_headings = []

    def feed(self, data):
        """Remember *data* for later position checks, then parse it."""
        self.content = data
        super().feed(data)

    def update_position(self, data):
        """Advance the hand-maintained counters past the text *data*."""
        newline_count = data.count('\n')
        if newline_count:
            self.line_number += newline_count
            # Column restarts after the last newline contained in data.
            self.column_number = len(data) - data.rfind('\n') - 1
        else:
            self.column_number += len(data)
        self.position += len(data)

    def handle_starttag(self, tag, attrs):
        """Push checked tags on the stack and note links opened in headings."""
        line, col = self.getpos()
        start_position = (line, col, self.position)
        self.update_position(self.get_starttag_text())
        tag = tag.lower()
        if tag in self.check_tags:
            self.tags_stack.append((tag, *start_position))
        if tag in self.HEADING_TAGS:
            self.in_heading = True
            self.current_heading = tag
        if self.in_heading and tag == 'a':
            # Recorded after update_position, i.e. at the counters' value
            # just past the <a ...> tag.
            self.links_in_headings.append(
                (self.current_heading, self.line_number, self.column_number))

    def handle_endtag(self, tag):
        """Pop a matching open tag or record an unexpected closing tag."""
        line, col = self.getpos()
        tag = tag.lower()
        closing_text = f"</{tag}>"
        if tag in self.check_tags:
            if self.tags_stack and self.tags_stack[-1][0] == tag:
                self.tags_stack.pop()
            else:
                # Same six-field shape as the "Unclosed tag" records built by
                # check_unmatched, so consumers can unpack both uniformly.
                self.unmatched_tags.append((
                    f"Unexpected closing tag {closing_text}",
                    line, col, self.position,
                    line, col + len(closing_text),
                ))
        if tag in self.HEADING_TAGS:
            self.in_heading = False
            self.current_heading = ""
        self.update_position(closing_text)

    def handle_data(self, data):
        """Advance the counters past a run of text."""
        self.update_position(data)

    def handle_comment(self, data):
        """Advance the counters past a comment.

        *data* is the text between ``<!--`` and ``-->``, so the delimiters
        are added back to account for the full length of the comment.
        """
        self.update_position(f"<!--{data}-->")

    def handle_decl(self, decl):
        """Advance the counters past a ``<!...>`` declaration such as a doctype."""
        self.update_position(f"<!{decl}>")

    def handle_pi(self, data):
        """Advance the counters past a ``<?...>`` processing instruction."""
        self.update_position(f"<?{data}>")

    def check_unmatched(self):
        """Return every unbalanced-tag record, sorted by (line, column).

        The result combines the unexpected closing tags collected while
        parsing with one "Unclosed tag" record per tag still on the stack;
        the latter end at the parser's current position.  Each record is
        ``(message, line, col, offset, end_line, end_col)``.  The method does
        not modify the checker, so it may be called repeatedly.
        """
        end_line, end_col = self.getpos()
        unclosed = [
            (f"Unclosed tag <{tag}>", start_line, start_col, start_pos, end_line, end_col)
            for tag, start_line, start_col, start_pos in self.tags_stack
        ]
        return sorted(self.unmatched_tags + unclosed, key=lambda issue: (issue[1], issue[2]))

    def verify_tag_at_position(self, tag, line, col, pos):
        """Report whether the text ``tag>`` starts at (*line*, *col*).

        The comparison ignores case and is made against the content last
        passed to ``feed``.  *line* is 1-based and *col* is a 0-based index
        into that line; it has to point at the first character of the tag
        name.  *pos* is accepted but not used.

        Returns ``False`` when *line* is outside the content.
        """
        # NOTE(review): the handlers record getpos(), which appears to be the
        # column of the construct's opening '<'; passing those columns here
        # unchanged will not match - confirm the intended usage.
        lines = self.content.split('\n')
        if not 0 <= line - 1 < len(lines):
            return False
        line_content = lines[line - 1].lower()
        expected_tag = f"{tag.lower()}>"
        return line_content[col:].startswith(expected_tag)
def _unpack_issue(record):
    """Return (message, line, col, end_line, end_col) for an issue record."""
    message, line, col = record[0], record[1], record[2]
    if len(record) >= 6:
        return message, line, col, record[4], record[5]
    # Some records carry only (message, line, col, offset) and no end
    # position; report the start position for both ends in that case.
    return message, line, col, line, col


def _removable_list_tag(message):
    """Return 'ul' or 'ol' if *message* reports such an unexpected closing tag."""
    prefix = "unexpected closing tag "
    lowered = message.lower()
    if not lowered.startswith(prefix):
        return None
    # Tolerate the tag being spelled with or without its "</" delimiter.
    tag = lowered[len(prefix):].strip("</> ")
    return tag if tag in ("ul", "ol") else None


def _has_closing_tag_at(lines, tag, line, col):
    """Report whether ``</tag>`` (any case) starts at 1-based *line*, 0-based *col*."""
    if not 1 <= line <= len(lines) or col < 0:
        return False
    closing = f"</{tag}>"
    # Slice first, then lowercase: lowercasing a whole line can change its
    # length for some characters and would misalign the column.
    return lines[line - 1][col:col + len(closing)].lower() == closing


def _delete_closing_tags(lines, removals):
    """Return the text of *lines* with each (tag, line, col) closing tag cut out."""
    edited = list(lines)
    # Cut from the end of the file backwards so that removing one tag never
    # shifts the column of another tag that is still waiting on the same line.
    ordered = sorted(set(removals), key=lambda item: (item[1], item[2]), reverse=True)
    for tag, line, col in ordered:
        width = len(tag) + 3  # "</" + tag + ">"
        text = edited[line - 1]
        edited[line - 1] = text[:col] + text[col + width:]
    return '\n'.join(edited)


def check_html_file(filename, backup_dir='/tmp'):
    """Check one HTML file and strip unexpected ``</ul>`` / ``</ol>`` tags.

    The file is parsed with ``HTMLChecker``.  Links found inside headings and
    every unbalanced tag are printed.  Unexpected closing ``ul``/``ol`` tags
    that are really present at the reported position are removed; when at
    least one is removed, the original file is first copied into
    *backup_dir* and then rewritten in place with its detected encoding.

    Args:
        filename: Path of the HTML file to check.
        backup_dir: Directory that receives the backup copy before the file
            is rewritten.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when the file cannot be read or parsed, or
            when the backup or the rewrite fails.
    """
    try:
        encoding = detect_encoding(filename)
        # newline='' keeps the file's own line endings, so rewriting the file
        # below changes nothing except the removed tags.
        with io.open(filename, 'r', encoding=encoding, newline='') as file:
            content = file.read()
    except (OSError, UnicodeError, LookupError) as e:
        # UnicodeError / LookupError: the detected encoding does not decode
        # the file or is not known to Python.
        print(f"Error: Unable to read file '{filename}'. {str(e)}")
        sys.exit(1)

    checker = HTMLChecker()
    try:
        checker.feed(content)
    except Exception as e:
        print(f"Error parsing HTML in file '{filename}': {str(e)}")
        sys.exit(1)

    unmatched = checker.check_unmatched()
    full_path = os.path.abspath(filename)

    if checker.links_in_headings:
        print(f"\nLinks found in headings in file '{full_path}':")
        for heading, line, col in checker.links_in_headings:
            print(f" <{heading}> tag contains a link at line {line}, column {col}")

    if not unmatched:
        return

    lines = content.split('\n')
    removals = []
    for record in unmatched:
        error, start_line, start_col, end_line, end_col = _unpack_issue(record)
        print(f"{full_path}: {error} started at line {start_line}, column {start_col} and ended at line {end_line}, column {end_col}")
        tag_to_remove = _removable_list_tag(error)
        if tag_to_remove is None:
            continue
        # Positions are checked against the unmodified text; the actual
        # deletion happens afterwards in one pass.
        if _has_closing_tag_at(lines, tag_to_remove, start_line, start_col):
            removals.append((tag_to_remove, start_line, start_col))
            print(f"Removed unexpected </{tag_to_remove}> tag (case insensitive) at line {start_line}, column {start_col}")
        else:
            print(f"Warning: Expected </{tag_to_remove}> (case insensitive) at line {start_line}, column {start_col}, but it was not found or in incorrect format.")

    if not removals:
        return

    modified_content = _delete_closing_tags(lines, removals)
    backup_path = os.path.join(backup_dir, os.path.basename(filename))
    try:
        shutil.copy2(filename, backup_path)
    except IOError as e:
        print(f"Error: Unable to create backup for file '{filename}'. {str(e)}")
        sys.exit(1)
    print(f"Backup created at {backup_path}")

    try:
        with io.open(filename, 'w', encoding=encoding, newline='') as file:
            file.write(modified_content)
    except IOError as e:
        print(f"Error: Unable to write to file '{filename}'. {str(e)}")
        sys.exit(1)
    print(f"File {filename} has been updated.")
if __name__ == "__main__":
    # Exactly one command-line argument is required: the HTML file to check.
    if len(sys.argv) != 2:
        print("Usage: python html_checker.py <filename>")
        sys.exit(1)
    check_html_file(sys.argv[1])