243 lines
6.3 KiB
Python
243 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import gzip
|
|
import io
|
|
import re
|
|
import sys
|
|
import urllib.request
|
|
|
|
from urllib.parse import urlparse, unquote
|
|
from pathlib import Path
|
|
from typing import Iterable, Optional, Set
|
|
|
|
# Sources used when no -u/--url option is given on the command line.
# Each entry is either a URL or a local path; schemeless entries are opened
# as files, so relative paths resolve against the current working directory.
DEFAULT_URLS = [
    # Popular public blocklists. The parser in this file accepts ABP rules,
    # hosts-file lines and plain domain lines, so either format works here.
    "https://big.oisd.nl/",
    "https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts",

    # Custom blocklists. These may contain ABP rules,
    # but mainly plain domain syntax.
    "./custom-domains.txt",
]
|
|
|
|
# Pure ABP domain rule such as "||example.com^"; group 1 captures the host.
ABP_DOMAIN_RE = re.compile(r"^\|\|([A-Za-z0-9._-]+)\^$")

# Field separator for hosts-file lines: any run of whitespace.
HOSTS_SPLIT_RE = re.compile(r"\s+")

# Syntactic check for a fully qualified host name (case-insensitive):
#   * total length 1-253 characters, optional trailing dot;
#   * at least two labels, each 1-63 chars of [a-z0-9-] that neither starts
#     nor ends with a hyphen;
#   * the last label is 2-63 chars and must not be purely numeric, so dotted
#     IPv4 literals such as "192.168.1.10" are NOT accepted as domains.
VALID_DOMAIN_RE = re.compile(
    r"^(?=.{1,253}$)"
    r"(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+"
    r"(?![0-9]+\.?$)[a-z0-9][a-z0-9-]{0,61}[a-z0-9]"
    r"\.?$",
    re.IGNORECASE,
)
|
|
|
|
def download_text(url: str, timeout: int = 30) -> str:
    """Return the text of a blocklist given as a URL or a local path.

    Local sources are ``file://`` URLs, paths without a scheme
    (``./list.txt``, ``list.txt``) and Windows drive paths such as
    ``C:\\lists\\ads.txt``.  Everything else is fetched with ``urllib``.

    Args:
        url: URL or local path of the blocklist.
        timeout: Network timeout in seconds; ignored for local files.

    Returns:
        The decoded text.  Undecodable bytes are replaced, never raised.

    Raises:
        OSError: If a local file cannot be read or the request fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
    """
    parsed = urlparse(url)

    local_path: Optional[Path] = None
    if parsed.scheme == "file":
        # url2pathname percent-decodes and, on Windows, turns "/C:/dir/x"
        # into a native drive path.
        local_path = Path(urllib.request.url2pathname(parsed.path))
    elif len(parsed.scheme) <= 1:
        # No scheme at all, or a single letter: urlparse reports the drive
        # letter of "C:\\dir\\x" as a one-character scheme.
        local_path = Path(url)

    if local_path is not None:
        return local_path.read_text(encoding="utf-8", errors="replace")

    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": "mikrotik-adlist-builder/1.0",
            "Accept-Encoding": "gzip",
        },
    )

    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
        content_encoding = response.headers.get("Content-Encoding", "").lower()
        charset = response.headers.get_content_charset() or "utf-8"

    # Some servers send gzip data without declaring it, so also sniff the
    # gzip magic number.
    if content_encoding == "gzip" or raw[:2] == b"\x1f\x8b":
        raw = gzip.decompress(raw)

    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server advertised a charset Python does not know; fall back
        # to UTF-8 rather than dropping the whole list.
        return raw.decode("utf-8", errors="replace")
|
|
|
|
def normalize_domain(domain: str) -> Optional[str]:
    """Return a cleaned, lower-case host name, or ``None`` if unusable.

    The value is stripped, lower-cased and has trailing dots removed; a
    leading ``*.`` wildcard is dropped.

    Args:
        domain: Raw token taken from a blocklist line.

    Returns:
        The normalized domain, or ``None`` for empty input, the local aliases
        ``localhost``/``local``/``broadcasthost``, values containing ``/``,
        ``\\`` or ``:``, IPv4 literals, and names rejected by
        ``VALID_DOMAIN_RE``.
    """
    domain = domain.strip().lower().rstrip(".")
    if not domain:
        return None

    if domain in {"localhost", "local", "broadcasthost"}:
        return None

    # Paths/URLs, IPv6 literals and host:port pairs are not bare host names.
    if any(ch in domain for ch in "/\\:"):
        return None

    if domain.startswith("*."):
        domain = domain[2:]

    # A purely numeric last label means a dotted IPv4 literal (for example
    # "192.168.1.10"), which the domain pattern alone would let through.
    if domain.rpartition(".")[2].isdigit():
        return None

    if not VALID_DOMAIN_RE.match(domain):
        return None

    return domain
|
|
|
|
def extract_from_abp_line(line: str) -> Optional[str]:
    """Pull the blocked domain out of an Adblock-Plus style rule.

    Args:
        line: One rule, already stripped of surrounding whitespace.

    Returns:
        The normalized domain for rules starting with ``||``, or ``None``
        when the line is not such a rule or its host part is not a valid
        domain.
    """
    # Canonical form: ||example.com^
    exact = ABP_DOMAIN_RE.match(line)
    if exact is not None:
        return normalize_domain(exact.group(1))

    if not line.startswith("||"):
        return None

    # Looser variants (missing "^", trailing path or options): keep only the
    # text in front of the first "^", "/" or "$".
    host = line[2:]
    for delimiter in "^/$":
        host = host.partition(delimiter)[0]
    return normalize_domain(host)
|
|
|
|
def extract_from_hosts_line(line: str) -> Set[str]:
    """Collect the domains named on one hosts-file line.

    Args:
        line: A single line such as ``0.0.0.0 ads.example.com # note``.

    Returns:
        The set of valid, normalized host names that follow a recognised
        sink address; empty when the line is a comment, has no host field,
        or starts with any other address.
    """
    # Everything after "#" is an inline comment.
    entry = line.partition("#")[0].strip()
    if not entry:
        return set()

    address, *hostnames = HOSTS_SPLIT_RE.split(entry)
    if not hostnames:
        return set()

    # Only the addresses commonly used in blocking hosts files count.
    sink_addresses = {"0.0.0.0", "127.0.0.1", "::1", "::", "255.255.255.255"}
    if address.lower() not in sink_addresses:
        return set()

    return {name for name in map(normalize_domain, hostnames) if name}
|
|
|
|
def extract_plain_domain(line: str) -> Optional[str]:
    """Interpret a line that consists of nothing but a domain name.

    Args:
        line: One line of a blocklist.

    Returns:
        The normalized domain, or ``None`` for blank lines, comments,
        headers, ABP rules (including ``@@`` whitelist rules) and anything
        containing whitespace or the characters ``/``, ``^`` or ``$``.
    """
    candidate = line.strip()
    if not candidate:
        return None

    # "!" and "#" start comments, "[" a header, "@@" a whitelist rule and
    # "|" (which also covers "||") an anchored ABP rule.
    if candidate.startswith(("!", "#", "[", "@@", "|")):
        return None

    # Any of these characters means the line is more than a bare host name.
    if any(marker in candidate for marker in " \t/^$"):
        return None

    return normalize_domain(candidate)
|
|
|
|
def extract_domains(text: str) -> Set[str]:
    """Extract every blockable domain from the text of one blocklist.

    Each non-blank line that is not a comment, header or ``@@`` whitelist
    rule is tried as an ABP rule, then as a hosts-file line, then as a plain
    domain; the first interpretation that yields a result wins.

    Args:
        text: Full contents of a blocklist.

    Returns:
        The set of normalized domains found in ``text``.
    """
    # "!" / "#" comments, "[...]" metadata headers, "@@" ABP whitelist rules.
    ignored_prefixes = ("!", "#", "[", "@@")
    collected: Set[str] = set()

    for raw_line in io.StringIO(text):
        entry = raw_line.strip()
        if not entry or entry.startswith(ignored_prefixes):
            continue

        # 1) ABP syntax
        from_abp = extract_from_abp_line(entry)
        if from_abp:
            collected.add(from_abp)
            continue

        # 2) hosts file syntax (may name several domains per line)
        from_hosts = extract_from_hosts_line(entry)
        if from_hosts:
            collected |= from_hosts
            continue

        # 3) plain domain syntax
        from_plain = extract_plain_domain(entry)
        if from_plain:
            collected.add(from_plain)

    return collected
|
|
|
|
def build_output(urls: Iterable[str], output_file: str) -> int:
    """Merge the given blocklists into one ``0.0.0.0 domain`` file.

    A source that fails to download or parse is reported on stderr and
    skipped; the remaining sources are still processed.

    Args:
        urls: URLs or local paths of the blocklists to combine.
        output_file: Path of the file to (over)write.

    Returns:
        The number of unique domains written.
    """
    merged: Set[str] = set()

    for source in urls:
        print(f"[INFO] Downloading: {source}", file=sys.stderr)
        try:
            found = extract_domains(download_text(source))
            print(f"[INFO] Domains found: {len(found)}", file=sys.stderr)
            merged |= found
        except Exception as exc:
            # Best effort: one broken source must not abort the whole build.
            print(f"[ERROR] {source}: {exc}", file=sys.stderr)

    ordered = sorted(merged)

    # newline="\n" keeps LF line endings on every platform.
    with open(output_file, "w", encoding="utf-8", newline="\n") as out:
        out.writelines(f"0.0.0.0 {name}\n" for name in ordered)

    return len(ordered)
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options from ``sys.argv``.

    Returns:
        A namespace with ``urls`` (list of the ``-u/--url`` values in order,
        or ``None`` when the option was never given) and ``output`` (path of
        the file to write, ``adlist.txt`` by default).
    """
    cli = argparse.ArgumentParser(
        description=(
            "Download multiple blocklists from URLs and create one MikroTik "
            "adlist in '0.0.0.0 domain' format."
        ),
    )
    # Repeatable: every occurrence appends to the same list.
    cli.add_argument(
        "-u", "--url",
        dest="urls",
        action="append",
        help="Blocklist URL. Can be used multiple times.",
    )
    cli.add_argument(
        "-o", "--output",
        default="adlist.txt",
        help="Output file.",
    )
    return cli.parse_args()
|
|
|
|
def main() -> int:
    """Run the command-line tool.

    Returns:
        Process exit status: ``1`` when there is no source to read, ``0``
        once the output file has been written.
    """
    options = parse_args()

    # Sources given on the command line replace the built-in defaults.
    sources = options.urls if options.urls else DEFAULT_URLS
    if not sources:
        print("Error: no URLs were provided. Use -u URL or edit DEFAULT_URLS.", file=sys.stderr)
        return 1

    total = build_output(sources, options.output)
    print(f"[OK] Output written to: {options.output}", file=sys.stderr)
    print(f"[OK] Total unique domains: {total}", file=sys.stderr)
    return 0
|
|
|
|
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    sys.exit(main())