#!/usr/bin/env python3
"""Merge several blocklists into one MikroTik adlist.

Each source (an HTTP(S) URL, a ``file://`` URL or a local path) is read and
parsed line by line.  Three line syntaxes are recognised:

* Adblock Plus (ABP) domain rules, e.g. ``||example.com^``
* hosts-file entries, e.g. ``0.0.0.0 example.com``
* bare domains, e.g. ``example.com``

The unique domains found are written, sorted, as ``0.0.0.0 domain`` lines.
"""

from __future__ import annotations

import argparse
import gzip
import io
import re
import sys
import urllib.request
from pathlib import Path
from typing import Iterable, Optional, Set
from urllib.parse import unquote, urlparse

DEFAULT_URLS = [
    # Popular public blocklists (ABP and hosts syntax)
    "https://big.oisd.nl/",
    "https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts",
    # Custom blocklists. These may contain ABP rules,
    # but mainly plain domain syntax.
    "./custom-domains.txt",
]

# Exact ABP domain rule: ||example.com^
ABP_DOMAIN_RE = re.compile(r"^\|\|([A-Za-z0-9._-]+)\^$")

# Field separator of a hosts-file entry.
HOSTS_SPLIT_RE = re.compile(r"\s+")

# Syntactic check for a fully qualified domain name.
#   * total length 1..253 characters
#   * every label is 1..63 characters of [a-z0-9-], not starting or ending
#     with a hyphen
#   * the last label (TLD) is at least 2 characters and not purely numeric,
#     so IPv4 literals such as "192.168.1.10" are rejected
# re.ASCII is required next to re.IGNORECASE: without it "[a-z]" also matches
# a few non-ASCII letters (e.g. U+017F, U+0131).
VALID_DOMAIN_RE = re.compile(
    r"^(?=.{1,253}$)"
    r"(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+"
    r"(?![0-9]+\.?$)"
    r"[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])"
    r"\.?$",
    re.IGNORECASE | re.ASCII,
)

# First character that ends the host part of an ABP rule.
_ABP_HOST_END_RE = re.compile(r"[\^/$]")

# Names that appear in the header of hosts files and must never be blocked.
_IGNORED_HOSTNAMES = frozenset(
    {"localhost", "localhost.localdomain", "local", "broadcasthost"}
)

# Addresses that commonly start an entry in a hosts-format blocklist.
_HOSTS_ADDRESSES = frozenset(
    {"0.0.0.0", "127.0.0.1", "::1", "::", "255.255.255.255"}
)

# Line prefixes that mark comments ("!", "#") or metadata ("[").
_COMMENT_PREFIXES = ("!", "#", "[")


def _read_local_text(path: Path) -> str:
    """Return the content of *path* decoded as UTF-8 (bad bytes replaced)."""
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        return f.read()


def download_text(url: str, timeout: int = 30) -> str:
    """Return the text of a blocklist source.

    Args:
        url: An HTTP(S) URL, a ``file://`` URL, or a local path
            (``./list.txt``, ``list.txt``, ``C:\\lists\\a.txt``).
        timeout: Network timeout in seconds, passed to ``urlopen``.

    Returns:
        The decoded text of the source.

    Raises:
        OSError: If a local file cannot be read or the request fails
            (``urllib.error.URLError`` is a subclass of ``OSError``).
    """
    parsed = urlparse(url)

    # Local file via file://
    if parsed.scheme == "file":
        return _read_local_text(Path(unquote(parsed.path)))

    # Local file without scheme, e.g. ./list.txt or list.txt.  A one-letter
    # "scheme" is a Windows drive letter (C:\...), not a URL scheme.
    if len(parsed.scheme) <= 1:
        return _read_local_text(Path(url))

    # HTTP/HTTPS
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": "mikrotik-adlist-builder/1.0",
            "Accept-Encoding": "gzip",
        },
    )
    with urllib.request.urlopen(req, timeout=timeout) as response:
        raw = response.read()
        encoding = response.headers.get("Content-Encoding", "").lower()
        charset = response.headers.get_content_charset() or "utf-8"

    # Decompress when the server says so, or when the payload starts with
    # the gzip magic bytes (e.g. a .gz file served without Content-Encoding).
    if encoding == "gzip" or raw[:2] == b"\x1f\x8b":
        raw = gzip.decompress(raw)

    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server announced a charset Python does not know.
        return raw.decode("utf-8", errors="replace")


def normalize_domain(domain: str) -> Optional[str]:
    """Return *domain* in canonical form, or ``None`` if it is not usable.

    The value is stripped, lower-cased, and loses trailing dots and a leading
    ``*.`` wildcard.  Local host names, anything containing ``/``, ``\\`` or
    ``:`` (paths, IPv6 addresses, ports), and anything that does not match
    ``VALID_DOMAIN_RE`` are rejected.
    """
    domain = domain.strip().lower().rstrip(".")
    if not domain or domain in _IGNORED_HOSTNAMES:
        return None

    # Paths, IPv6 addresses, ports, and similar entries
    if any(ch in domain for ch in "/\\:"):
        return None

    if domain.startswith("*."):
        domain = domain[2:]

    if not VALID_DOMAIN_RE.match(domain):
        return None
    return domain


def extract_from_abp_line(line: str) -> Optional[str]:
    """Return the domain of an ABP ``||domain`` rule, or ``None``.

    Handles the exact form ``||example.com^`` as well as variants with no
    trailing ``^`` or with a suffix (``^...``, ``/path``, ``$options``).
    The suffix is discarded, so such a rule yields its bare host name.
    """
    exact = ABP_DOMAIN_RE.match(line)
    if exact:
        return normalize_domain(exact.group(1))

    if not line.startswith("||"):
        return None

    # Keep everything up to the first "^", "/" or "$".
    host = _ABP_HOST_END_RE.split(line[2:], maxsplit=1)[0]
    return normalize_domain(host)


def extract_from_hosts_line(line: str) -> Set[str]:
    """Return the valid domains of one hosts-file entry.

    The inline ``#`` comment is removed first.  The entry is used only when
    its first field is one of the addresses in ``_HOSTS_ADDRESSES``; every
    following field is passed through ``normalize_domain``.
    """
    entry = line.split("#", 1)[0].strip()
    if not entry:
        return set()

    address, *names = HOSTS_SPLIT_RE.split(entry)
    if not names or address.lower() not in _HOSTS_ADDRESSES:
        return set()

    normalized = (normalize_domain(name) for name in names)
    return {domain for domain in normalized if domain}


def extract_plain_domain(line: str) -> Optional[str]:
    """Return the domain when *line* is a bare domain, else ``None``.

    Comments, metadata, ABP whitelist rules (``@@``), ABP anchors (``|``,
    ``||``) and lines containing whitespace, ``/``, ``^`` or ``$`` are
    rejected before the remainder is normalized.
    """
    line = line.strip()
    if not line:
        return None

    if line.startswith(_COMMENT_PREFIXES + ("@@", "|")):
        return None

    if any(ch in line for ch in " \t/^$"):
        return None

    return normalize_domain(line)


def extract_domains(text: str) -> Set[str]:
    """Return every blockable domain found in a blocklist *text*.

    Each non-empty line that is not a comment, metadata or an ABP whitelist
    rule is tried as an ABP rule, then as a hosts entry, then as a bare
    domain; the first syntax that yields a result wins.
    """
    # A UTF-8 byte-order mark would otherwise stick to the first line and
    # stop it from being recognised.
    if text.startswith("\ufeff"):
        text = text[1:]

    domains: Set[str] = set()
    for raw_line in io.StringIO(text):
        line = raw_line.strip()

        # Skip blanks, comments/metadata and ABP whitelist rules
        if not line or line.startswith(_COMMENT_PREFIXES + ("@@",)):
            continue

        # 1) ABP syntax
        abp_domain = extract_from_abp_line(line)
        if abp_domain:
            domains.add(abp_domain)
            continue

        # 2) hosts file syntax
        hosts_domains = extract_from_hosts_line(line)
        if hosts_domains:
            domains.update(hosts_domains)
            continue

        # 3) plain domain syntax
        plain_domain = extract_plain_domain(line)
        if plain_domain:
            domains.add(plain_domain)

    return domains


def build_output(urls: Iterable[str], output_file: str) -> int:
    """Fetch all *urls* and write the merged adlist to *output_file*.

    A source that cannot be fetched or parsed is reported on stderr and
    skipped, so one broken list does not abort the run.  When no domain at
    all was collected, *output_file* is left untouched so that an existing
    adlist is not replaced by an empty one.

    Returns:
        The number of unique domains collected (0 means nothing was written).
    """
    all_domains: Set[str] = set()

    for url in urls:
        print(f"[INFO] Downloading: {url}", file=sys.stderr)
        try:
            domains = extract_domains(download_text(url))
        except Exception as exc:  # best effort: report and go on
            print(f"[ERROR] {url}: {exc}", file=sys.stderr)
            continue
        print(f"[INFO] Domains found: {len(domains)}", file=sys.stderr)
        all_domains.update(domains)

    if not all_domains:
        print(
            f"[WARN] No domains collected; {output_file} was not written.",
            file=sys.stderr,
        )
        return 0

    with open(output_file, "w", encoding="utf-8", newline="\n") as f:
        f.writelines(f"0.0.0.0 {domain}\n" for domain in sorted(all_domains))

    return len(all_domains)


def parse_args() -> argparse.Namespace:
    """Parse the command line (``-u/--url`` repeatable, ``-o/--output``)."""
    parser = argparse.ArgumentParser(
        description="Download multiple blocklists from URLs and create one MikroTik adlist in '0.0.0.0 domain' format."
    )
    parser.add_argument(
        "-u",
        "--url",
        action="append",
        dest="urls",
        help="Blocklist URL. Can be used multiple times.",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="adlist.txt",
        help="Output file.",
    )
    return parser.parse_args()


def main() -> int:
    """Run the builder; return the process exit status (0 on success)."""
    args = parse_args()
    urls = args.urls or DEFAULT_URLS

    if not urls:
        print("Error: no URLs were provided. Use -u URL or edit DEFAULT_URLS.", file=sys.stderr)
        return 1

    count = build_output(urls, args.output)
    if count == 0:
        print("[ERROR] No domains could be collected from any source.", file=sys.stderr)
        return 1

    print(f"[OK] Output written to: {args.output}", file=sys.stderr)
    print(f"[OK] Total unique domains: {count}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())