Files
mikrotik-adlist-builder/mikrotik-adlist-builder.py
2026-04-01 13:59:27 +02:00

243 lines
6.3 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import gzip
import io
import re
import sys
import urllib.request
from urllib.parse import urlparse, unquote
from pathlib import Path
from typing import Iterable, Optional, Set
# Sources used when no -u/--url option is given on the command line.
# Each entry is either an HTTP(S) URL or a local path.
DEFAULT_URLS = [
    # Popular public blocklists (ABP-style and hosts-style syntax).
    "https://big.oisd.nl/",
    "https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts",
    # Locally maintained list: mostly plain domains, one per line,
    # although ABP rules are accepted as well.
    "./custom-domains.txt",
]
# Strict Adblock Plus domain rule such as "||ads.example.com^".
# Group 1 captures the host part between the "||" anchor and the "^".
ABP_DOMAIN_RE = re.compile(r"^\|\|([A-Za-z0-9._-]+)\^$")

# Field separator for hosts-file lines: any run of whitespace.
HOSTS_SPLIT_RE = re.compile(r"\s+")

# Syntactic check for a fully qualified host name (case-insensitive):
#   * at most 253 characters overall (an optional trailing dot is allowed),
#   * at least two labels, each 1-63 characters of letters, digits or
#     hyphens, neither starting nor ending with a hyphen,
#   * a final label of 2-63 characters that is not purely numeric, so
#     IPv4 literals such as "192.168.1.10" are not mistaken for domains.
VALID_DOMAIN_RE = re.compile(
    r"^(?=.{1,253}$)"
    r"(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+"
    r"(?![0-9]+\.?$)"
    r"[a-z0-9][a-z0-9-]{0,61}[a-z0-9]"
    r"\.?$",
    re.IGNORECASE,
)
def download_text(url: str, timeout: int = 30) -> str:
    """Return the text of a blocklist given as a URL or a local path.

    Sources are resolved as follows:

    * ``file://`` URLs and plain paths (``./list.txt``, ``list.txt``,
      ``C:/lists/ads.txt``) are read from disk as UTF-8.
    * Everything else is fetched with ``urllib.request.urlopen``; gzip
      compressed bodies are decompressed transparently.

    A leading byte-order mark is removed so that it cannot corrupt the
    first line of the list. Undecodable bytes are replaced, never fatal.

    Args:
        url: URL or filesystem path of the list.
        timeout: Network timeout in seconds (ignored for local files).

    Returns:
        The decoded text of the list.

    Raises:
        OSError: The local file cannot be read or the request failed
            (``urllib.error.URLError`` is a subclass of ``OSError``).
        gzip.BadGzipFile: The body claims to be gzip but is not.
    """
    parsed = urlparse(url)

    local_path = None
    if parsed.scheme == "file":
        local_path = Path(unquote(parsed.path))
    elif len(parsed.scheme) <= 1:
        # Empty scheme: a relative or absolute path. A one-letter "scheme"
        # is a Windows drive letter ("C:/..."), not a real URL scheme.
        local_path = Path(url)

    if local_path is not None:
        # "utf-8-sig" behaves like UTF-8 but drops a leading BOM.
        with open(local_path, "r", encoding="utf-8-sig", errors="replace") as f:
            return f.read()

    # HTTP/HTTPS (or any other scheme urllib knows how to open)
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": "mikrotik-adlist-builder/1.0",
            "Accept-Encoding": "gzip",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as response:
        raw = response.read()
        encoding = response.headers.get("Content-Encoding", "").strip().lower()
        charset = response.headers.get_content_charset() or "utf-8"

    # Some servers send gzip data without declaring it, so the gzip magic
    # number is checked in addition to the Content-Encoding header.
    if encoding in ("gzip", "x-gzip") or raw[:2] == b"\x1f\x8b":
        raw = gzip.decompress(raw)

    try:
        text = raw.decode(charset, errors="replace")
    except LookupError:
        # The server advertised a charset Python does not know.
        text = raw.decode("utf-8", errors="replace")

    if text.startswith("\ufeff"):
        text = text[1:]
    return text
def normalize_domain(domain: str) -> Optional[str]:
    """Return *domain* in canonical form, or ``None`` if it must be skipped.

    The value is stripped, lower-cased, and freed of trailing dots and of a
    leading ``*.`` wildcard. ``None`` is returned for:

    * empty input,
    * loopback/broadcast placeholder names found in hosts files,
    * anything containing ``/``, ``\\`` or ``:`` (paths, URLs, IPv6
      addresses, host:port pairs),
    * names rejected by ``VALID_DOMAIN_RE``.
    """
    domain = domain.strip().lower().rstrip(".")
    if not domain:
        return None
    # Placeholder names from hosts-file headers; blocking them would break
    # local name resolution. "localhost.localdomain" is listed explicitly
    # because, unlike the single-label names, it would pass the regex below.
    if domain in {"localhost", "localhost.localdomain", "local", "broadcasthost"}:
        return None
    if "/" in domain or "\\" in domain:
        return None
    if ":" in domain:
        # Ignore IPv6, ports, and similar entries
        return None
    if domain.startswith("*."):
        domain = domain[2:]
    if not VALID_DOMAIN_RE.match(domain):
        return None
    return domain
def extract_from_abp_line(line: str) -> Optional[str]:
    """Extract the blocked domain from an Adblock Plus style rule.

    Handles the strict form ``||example.com^`` as well as looser variants
    that start with ``||`` and continue with a path, options, or nothing
    at all. Returns ``None`` when *line* is not a ``||`` rule or when the
    host part is not a usable domain.
    """
    strict = ABP_DOMAIN_RE.match(line)
    if strict is not None:
        return normalize_domain(strict.group(1))

    if not line.startswith("||"):
        return None

    # Loose form: keep only what precedes the first "^", "/" or "$".
    host = line[2:]
    for delimiter in ("^", "/", "$"):
        host = host.partition(delimiter)[0]
    return normalize_domain(host)
def extract_from_hosts_line(line: str) -> Set[str]:
    """Collect the domains from one hosts-file line.

    A line qualifies when its first field is one of the usual sink
    addresses and at least one host name follows. Everything after a
    ``#`` is treated as a comment. Host names that do not normalize to a
    valid domain are dropped. Returns an empty set for any other line.
    """
    body = line.partition("#")[0].strip()
    if not body:
        return set()

    address, *hostnames = HOSTS_SPLIT_RE.split(body)
    sink_addresses = {"0.0.0.0", "127.0.0.1", "::1", "::", "255.255.255.255"}
    if not hostnames or address.lower() not in sink_addresses:
        return set()

    normalized = (normalize_domain(name) for name in hostnames)
    return {name for name in normalized if name}
def extract_plain_domain(line: str) -> Optional[str]:
    """Interpret *line* as a bare domain name (one domain per line).

    Returns the normalized domain, or ``None`` when the line is empty, a
    comment/metadata line, an ABP whitelist or anchor rule, or contains
    characters that mark it as something other than a plain domain.

    A trailing inline comment (``example.com  # note``) is ignored, which
    matches how hosts-style lines are handled. The ``#`` must be preceded
    by whitespace so that ABP element-hiding rules such as
    ``example.com##.ad`` are not mistaken for a domain plus a comment.
    """
    line = line.strip()
    if not line:
        return None
    if line.startswith(("!", "#", "[")):
        return None
    if line.startswith("@@"):
        # Ignore whitelist rules
        return None
    if line.startswith(("||", "|")):
        return None
    # Drop a whitespace-separated trailing comment before the checks below,
    # which would otherwise reject the line because of the space.
    line = re.split(r"\s+#", line, maxsplit=1)[0]
    if any(x in line for x in [" ", "\t", "/", "^", "$"]):
        return None
    return normalize_domain(line)
def extract_domains(text: str) -> Set[str]:
    """Parse a whole blocklist and return the set of domains it blocks.

    Each non-empty line is tried, in this order, as an ABP rule, a
    hosts-file entry, and a plain domain; the first interpretation that
    yields a result wins. Comment lines (``!``, ``#``), metadata lines
    (``[``) and ABP whitelist rules (``@@``) are skipped.
    """
    found: Set[str] = set()
    for raw in io.StringIO(text):
        entry = raw.strip()
        # Blank lines, comments/metadata, and whitelist rules carry no
        # domains to block.
        if not entry or entry.startswith(("!", "#", "[", "@@")):
            continue

        from_abp = extract_from_abp_line(entry)
        if from_abp:
            found.add(from_abp)
            continue

        from_hosts = extract_from_hosts_line(entry)
        if from_hosts:
            found |= from_hosts
            continue

        from_plain = extract_plain_domain(entry)
        if from_plain:
            found.add(from_plain)
    return found
def build_output(urls: Iterable[str], output_file: str) -> int:
    """Merge all sources into one adlist file in ``0.0.0.0 domain`` format.

    Every source is processed on a best-effort basis: a source that fails
    to download or parse is reported on stderr and skipped. The merged
    domains are de-duplicated and written in sorted order.

    If no domain could be collected at all (for example because the
    network is down), *output_file* is left untouched so that a previously
    generated list is not replaced by an empty one.

    Args:
        urls: URLs or local paths of the source lists.
        output_file: Path of the adlist file to write.

    Returns:
        The number of unique domains written; 0 if nothing was written.
    """
    all_domains: Set[str] = set()
    for url in urls:
        print(f"[INFO] Downloading: {url}", file=sys.stderr)
        try:
            text = download_text(url)
            domains = extract_domains(text)
        except Exception as exc:
            # Deliberately broad: one broken source must not abort the run.
            print(f"[ERROR] {url}: {exc}", file=sys.stderr)
        else:
            print(f"[INFO] Domains found: {len(domains)}", file=sys.stderr)
            all_domains.update(domains)

    if not all_domains:
        print(
            f"[ERROR] No domains collected; {output_file} was not modified.",
            file=sys.stderr,
        )
        return 0

    sorted_domains = sorted(all_domains)
    # newline="\n" keeps LF line endings regardless of the platform.
    with open(output_file, "w", encoding="utf-8", newline="\n") as f:
        f.writelines(f"0.0.0.0 {domain}\n" for domain in sorted_domains)
    return len(sorted_domains)
def parse_args() -> argparse.Namespace:
    """Parse the command line.

    Returns a namespace with ``urls`` (list of strings given via the
    repeatable ``-u/--url`` option, or ``None`` when the option was not
    used) and ``output`` (target file, ``adlist.txt`` by default).
    """
    cli = argparse.ArgumentParser(
        description=(
            "Download multiple blocklists from URLs and create one MikroTik adlist in '0.0.0.0 domain' format."
        ),
    )
    # (flags, keyword arguments) for every supported option.
    option_specs = (
        (
            ("-u", "--url"),
            {
                "action": "append",
                "dest": "urls",
                "help": "Blocklist URL. Can be used multiple times.",
            },
        ),
        (
            ("-o", "--output"),
            {
                "default": "adlist.txt",
                "help": "Output file.",
            },
        ),
    )
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()
def main() -> int:
    """Run the builder and return the process exit status.

    Uses the URLs given on the command line, falling back to
    ``DEFAULT_URLS``. Returns 0 on success and 1 when there is nothing to
    download or when no domain could be collected from any source, so
    that schedulers and scripts can detect a failed run.
    """
    args = parse_args()
    urls = args.urls or DEFAULT_URLS
    if not urls:
        print("Error: no URLs were provided. Use -u URL or edit DEFAULT_URLS.", file=sys.stderr)
        return 1
    count = build_output(urls, args.output)
    if count == 0:
        # Every source failed or was empty; do not report success.
        print("Error: no domains were collected from any source.", file=sys.stderr)
        return 1
    print(f"[OK] Output written to: {args.output}", file=sys.stderr)
    print(f"[OK] Total unique domains: {count}", file=sys.stderr)
    return 0
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())