Initial commit

2026-04-01 13:59:27 +02:00
parent 79cb7f70aa
commit c0fc949006
3 changed files with 466 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -1,3 +1,223 @@
-# mikrotik-adlist-builder
+# Mikrotik Adlist Builder

-Tool for building mikrotik adlists for blocking ands and harmful domains.
+`mikrotik-adlist-builder` is a small Python tool for building MikroTik adlists used to block ads and harmful domains.
+
+It can download multiple blocklists from remote URLs, read local files, extract valid domain names from different formats, merge them, remove duplicates, and write the final output in a MikroTik-friendly format:
+
+```text
+0.0.0.0 example.com
+0.0.0.0 ads.example.net
+```
+
+## Features
+
+- Supports multiple input sources
+- Downloads blocklists from `http://` and `https://` URLs
+- Reads local files from:
+  - relative paths such as `./custom-domains.txt`
+  - absolute paths
+  - `file://` URLs
+- Supports multiple common blocklist formats:
+  - ABP-style rules such as `||example.com^`
+  - hosts file syntax such as `0.0.0.0 example.com`
+  - plain domain lists such as `example.com`
+- Removes duplicates automatically
+- Filters out invalid entries
+- Writes a merged output file ready for MikroTik adlist import
+
+## Requirements
+
+- Python 3.9 or newer
+
+## Installation
+
+No external dependencies are required.
+
+Clone the repository or just save the script locally, then make it executable:
+
+```bash
+chmod +x mikrotik-adlist-builder.py
+```
+
+You can then run it directly:
+
+```bash
+./mikrotik-adlist-builder.py
+```
+
+Or with Python:
+
+```bash
+python3 mikrotik-adlist-builder.py
+```
+
+## Default sources
+
+The script includes a built-in `DEFAULT_URLS` list. Example:
+
+```python
+DEFAULT_URLS = [
+    "https://big.oisd.nl/",
+    "https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts",
+    "./custom-domains.txt",
+]
+```
+
+This means the tool can combine public online blocklists with your own local domain list.
+
+## Usage
+
+### Use default sources
+
+```bash
+python3 mikrotik-adlist-builder.py
+```
+
+This will create:
+
+```text
+adlist.txt
+```
+
+### Specify custom URLs
+
+```bash
+python3 mikrotik-adlist-builder.py \
+  -u https://big.oisd.nl/ \
+  -u https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts
+```
+
+### Mix remote and local sources
+
+```bash
+python3 mikrotik-adlist-builder.py \
+  -u https://big.oisd.nl/ \
+  -u ./custom-domains.txt \
+  -u ./my-extra-list.txt
+```
+
+### Use a local file via `file://`
+
+```bash
+python3 mikrotik-adlist-builder.py \
+  -u file:///home/user/blocklists/custom.txt
+```
+
+### Change output file
+
+```bash
+python3 mikrotik-adlist-builder.py \
+  -o mikrotik-adlist.txt
+```
+
+### Full example
+
+```bash
+python3 mikrotik-adlist-builder.py \
+  -u https://big.oisd.nl/ \
+  -u https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts \
+  -u ./custom-domains.txt \
+  -o mikrotik-adlist.txt
+```
+
+## Supported input formats
+
+### 1. ABP syntax
+
+Example:
+
+```text
+||example.com^
+||ads.example.net^
+```
+
+Some simplified variants are also accepted, for example:
+
+```text
+||example.com
+```
+
+### 2. Hosts syntax
+
+Example:
+
+```text
+0.0.0.0 example.com
+127.0.0.1 ads.example.net
+```
+
+### 3. Plain domain syntax
+
+Example:
+
+```text
+example.com
+ads.example.net
+tracker.example.org
+```
+
+## Local custom domain file example
+
+Example `custom-domains.txt`:
+
+```text
+example-bad-site.com
+ads.example.net
+tracker.example.org
+```
+
+You can also mix in hosts-style entries:
+
+```text
+0.0.0.0 bad.example.com
+127.0.0.1 ads.badsite.net
+```
+
+And ABP-style rules:
+
+```text
+||tracker.example.org^
+||ads.example.net^
+```
+
+## Output format
+
+The generated file contains one domain per line in this format:
+
+```text
+0.0.0.0 domain.tld
+```
+
+Example:
+
+```text
+0.0.0.0 ads.example.com
+0.0.0.0 tracker.example.net
+0.0.0.0 malware.example.org
+```
+
+## Import into MikroTik
+
+The resulting file is intended to be used as a source for a MikroTik adlist or for further processing before import, depending on your RouterOS version and setup.
+
+## Notes
+
+- Relative local paths are resolved against the current working directory from which you run the script.
+- `file://` paths should normally be absolute.
+- Duplicate domains are removed automatically.
+- Invalid lines, comments, whitelist rules, localhost-style entries, IPv6 entries, and malformed domains are ignored.
+
+## Example output messages
+
+```text
+[INFO] Downloading: https://big.oisd.nl/
+[INFO] Domains found: 123456
+[INFO] Downloading: ./custom-domains.txt
+[INFO] Domains found: 25
+[OK] Output written to: adlist.txt
+[OK] Total unique domains: 123470
+```
+
+## License
+
+Use, modify, and distribute freely as needed.
--- a/custom-domains.txt
+++ b/custom-domains.txt
@@ -0,0 +1 @@
+0.0.0.0 ssp.seznam.cz
--- a/mikrotik-adlist-builder.py
+++ b/mikrotik-adlist-builder.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import gzip
+import io
+import re
+import sys
+import urllib.request
+
+from urllib.parse import urlparse, unquote
+from pathlib import Path
+from typing import Iterable, Optional, Set
+
+# Sources used when no -u/--url option is given. Each entry is either an
+# http(s) URL or a local path.
+DEFAULT_URLS = [
+    # Popular public blocklists: OISD is published in ABP syntax,
+    # StevenBlack in hosts syntax.
+    "https://big.oisd.nl/",
+    "https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts",
+
+    # Custom blocklists. These may contain ABP rules,
+    # but mainly plain domain syntax.
+    "./custom-domains.txt",
+]
+
+# Strict ABP domain rule: "||example.com^" with nothing after the separator.
+ABP_DOMAIN_RE = re.compile(r"^\|\|([A-Za-z0-9._-]+)\^$")
+
+# Field separator of hosts-file lines (any run of whitespace).
+HOSTS_SPLIT_RE = re.compile(r"\s+")
+
+# Fully qualified host name with at least two labels and an optional trailing dot.
+# - whole name is at most 253 characters
+# - every label is 1-63 characters and neither starts nor ends with "-"
+# - the last label is 2-63 characters and not purely numeric, so dotted-quad
+#   IPv4 literals such as "192.168.1.10" are not mistaken for domains
+VALID_DOMAIN_RE = re.compile(
+    r"^(?=.{1,253}$)"
+    r"(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+"
+    r"(?![0-9]+\.?$)[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])\.?$",
+    re.IGNORECASE,
+)
+
+def _gunzip_if_needed(raw: bytes) -> bytes:
+    """Return *raw* with up to two layers of gzip compression removed."""
+    # Two layers cover a ".gz" file that was additionally compressed in transit.
+    # Detection uses the gzip magic bytes, so plain payloads pass through untouched.
+    for _ in range(2):
+        if raw[:2] != b"\x1f\x8b":
+            break
+        raw = gzip.decompress(raw)
+    return raw
+
+def download_text(url: str, timeout: int = 30) -> str:
+    """Return the text of a blocklist source.
+
+    Args:
+        url: An ``http://``/``https://`` URL, a ``file://`` URL, or a plain
+            local path (relative, absolute, or a Windows drive-letter path).
+        timeout: Network timeout in seconds; ignored for local files.
+
+    Returns:
+        The decoded text. Gzip-compressed content is decompressed, undecodable
+        bytes are replaced, and a leading byte-order mark is removed so the
+        first entry of the list is not corrupted.
+
+    Raises:
+        OSError: A local file cannot be read, or the gzip data is invalid.
+        urllib.error.URLError: The download fails.
+    """
+    parsed = urlparse(url)
+    scheme = parsed.scheme
+
+    # A one-letter "scheme" is a Windows drive letter (C:\lists\a.txt), not a URL.
+    if scheme in ("", "file") or len(scheme) == 1:
+        path = Path(unquote(parsed.path)) if scheme == "file" else Path(url)
+        text = _gunzip_if_needed(path.read_bytes()).decode("utf-8", errors="replace")
+        # Same newline translation a text-mode read would have applied.
+        text = text.replace("\r\n", "\n").replace("\r", "\n")
+    else:
+        # HTTP/HTTPS
+        req = urllib.request.Request(
+            url,
+            headers={
+                "User-Agent": "mikrotik-adlist-builder/1.0",
+                "Accept-Encoding": "gzip",
+            },
+        )
+        with urllib.request.urlopen(req, timeout=timeout) as response:
+            raw = response.read()
+            charset = response.headers.get_content_charset() or "utf-8"
+
+        raw = _gunzip_if_needed(raw)
+        try:
+            text = raw.decode(charset, errors="replace")
+        except LookupError:
+            # The server announced a charset Python does not know; blocklists
+            # are ASCII-compatible, so UTF-8 with replacement is a safe fallback.
+            text = raw.decode("utf-8", errors="replace")
+
+    # A byte-order mark would otherwise stick to the first line and invalidate it.
+    if text.startswith("\ufeff"):
+        text = text[1:]
+    return text
+
+# Host names that hosts files map to the local machine; blocking them is meaningless.
+_RESERVED_HOSTNAMES = frozenset(
+    {"localhost", "localhost.localdomain", "local", "broadcasthost"}
+)
+
+def normalize_domain(domain: str) -> Optional[str]:
+    """Return *domain* in canonical form, or ``None`` if it is not blockable.
+
+    The canonical form is lower-case, without surrounding whitespace, without
+    trailing dots and without a leading ``*.`` wildcard.
+
+    Args:
+        domain: Candidate host name taken from a blocklist line.
+
+    Returns:
+        The normalized domain, or ``None`` for empty input, reserved local host
+        names, IP addresses, entries containing a path or port, and anything
+        that does not match ``VALID_DOMAIN_RE``.
+    """
+    domain = domain.strip().lower().rstrip(".")
+
+    # Strip the wildcard first so "*.localhost.localdomain" is also recognised below.
+    if domain.startswith("*."):
+        domain = domain[2:]
+
+    if not domain or domain in _RESERVED_HOSTNAMES:
+        return None
+
+    # "/" and "\" indicate a URL or path; ":" indicates IPv6 or a port.
+    if any(ch in domain for ch in "/\\:"):
+        return None
+
+    # A purely numeric last label means a dotted-quad IPv4 address, not a host name.
+    if domain.rsplit(".", 1)[-1].isdigit():
+        return None
+
+    if not VALID_DOMAIN_RE.match(domain):
+        return None
+
+    return domain
+
+def extract_from_abp_line(line: str) -> Optional[str]:
+    """Extract the domain from an ABP-style ``||domain^`` rule.
+
+    Args:
+        line: A stripped, non-comment blocklist line.
+
+    Returns:
+        The normalized domain, or ``None`` when the line is not a ``||`` rule,
+        when its host part is not a valid domain, or when the rule is scoped to
+        a URL path. A path rule such as ``||example.com/ads/banner.js`` cannot
+        be expressed in a DNS adlist, and emitting its host would block the
+        whole domain.
+    """
+    # Example: ||example.com^
+    m = ABP_DOMAIN_RE.match(line)
+    if m:
+        return normalize_domain(m.group(1))
+
+    if not line.startswith("||"):
+        return None
+
+    # Variants: missing trailing "^", or "$options" after the rule.
+    candidate = line[2:]
+    for sep in ("^", "$"):
+        candidate = candidate.partition(sep)[0]
+
+    # A bare trailing "/" still addresses the whole host; anything after it is a path.
+    host, _, path = candidate.partition("/")
+    if path:
+        return None
+
+    return normalize_domain(host)
+
+def extract_from_hosts_line(line: str) -> Set[str]:
+    """Collect the valid domains of one hosts-file line.
+
+    The line must consist of a well-known sink/loopback address followed by
+    one or more host names; anything after ``#`` is treated as a comment.
+
+    Args:
+        line: A single blocklist line.
+
+    Returns:
+        The normalized domains found on the line; empty when the line is not
+        in hosts format or contains no valid domain.
+    """
+    content = line.partition("#")[0].strip()
+    fields = HOSTS_SPLIT_RE.split(content) if content else []
+
+    # Need an address plus at least one name, and the address must be a known sink.
+    sink_addresses = {"0.0.0.0", "127.0.0.1", "::1", "::", "255.255.255.255"}
+    if len(fields) < 2 or fields[0].lower() not in sink_addresses:
+        return set()
+
+    normalized = (normalize_domain(name) for name in fields[1:])
+    return {name for name in normalized if name}
+
+def extract_plain_domain(line: str) -> Optional[str]:
+    """Interpret a line as a bare domain name.
+
+    Args:
+        line: A single blocklist line.
+
+    Returns:
+        The normalized domain, or ``None`` when the line is empty, a comment or
+        metadata line, an ABP rule (including ``@@`` whitelist rules), or
+        contains characters that cannot appear in a plain domain entry.
+    """
+    candidate = line.strip()
+    if not candidate:
+        return None
+
+    # Comments/metadata ("!", "#", "["), whitelist rules ("@@") and anchored ABP rules.
+    if candidate.startswith(("!", "#", "[", "@@", "||", "|")):
+        return None
+
+    # Whitespace or rule syntax means this is not a single bare domain.
+    if not frozenset(" \t/^$").isdisjoint(candidate):
+        return None
+
+    return normalize_domain(candidate)
+
+def extract_domains(text: str) -> Set[str]:
+    """Extract every valid domain from the text of a blocklist.
+
+    Each line is tried as an ABP rule, then as a hosts-file line, then as a
+    plain domain; the first parser that yields a result wins.
+
+    Args:
+        text: Full content of one blocklist source.
+
+    Returns:
+        The set of normalized domains found in *text*.
+    """
+    found: Set[str] = set()
+
+    for entry in map(str.strip, text.split("\n")):
+        # Blank lines, comments/metadata and ABP whitelist rules carry no domains.
+        if not entry or entry.startswith(("!", "#", "[", "@@")):
+            continue
+
+        # 1) ABP syntax
+        from_abp = extract_from_abp_line(entry)
+        if from_abp:
+            found.add(from_abp)
+            continue
+
+        # 2) hosts file syntax (may name several hosts on one line)
+        from_hosts = extract_from_hosts_line(entry)
+        if from_hosts:
+            found |= from_hosts
+            continue
+
+        # 3) plain domain syntax
+        plain = extract_plain_domain(entry)
+        if plain:
+            found.add(plain)
+
+    return found
+
+def build_output(urls: Iterable[str], output_file: str) -> int:
+    """Merge all sources into one sorted MikroTik adlist file.
+
+    A source that cannot be fetched or parsed is reported on stderr and
+    skipped; the remaining sources are still processed.
+
+    Args:
+        urls: Blocklist sources (URLs or local paths).
+        output_file: Path of the file to write, one ``0.0.0.0 domain`` per line.
+
+    Returns:
+        The number of unique domains written.
+    """
+    merged: Set[str] = set()
+
+    for source in urls:
+        print(f"[INFO] Downloading: {source}", file=sys.stderr)
+        try:
+            found = extract_domains(download_text(source))
+            print(f"[INFO] Domains found: {len(found)}", file=sys.stderr)
+            merged |= found
+        except Exception as exc:
+            # Best effort: one broken source must not abort the whole build.
+            print(f"[ERROR] {source}: {exc}", file=sys.stderr)
+
+    ordered = sorted(merged)
+
+    with open(output_file, "w", encoding="utf-8", newline="\n") as out:
+        out.writelines(f"0.0.0.0 {name}\n" for name in ordered)
+
+    return len(ordered)
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line.
+
+    Returns:
+        A namespace with ``urls`` (list of sources given with ``-u``/``--url``,
+        or ``None`` when the option was not used) and ``output`` (output file
+        path, ``adlist.txt`` by default).
+    """
+    cli = argparse.ArgumentParser(
+        description="Download multiple blocklists from URLs and create one MikroTik adlist in '0.0.0.0 domain' format."
+    )
+
+    # Option table: (flags, keyword arguments for add_argument).
+    options = (
+        (
+            ("-u", "--url"),
+            {
+                "action": "append",
+                "dest": "urls",
+                "help": "Blocklist URL. Can be used multiple times.",
+            },
+        ),
+        (
+            ("-o", "--output"),
+            {"default": "adlist.txt", "help": "Output file."},
+        ),
+    )
+    for flags, settings in options:
+        cli.add_argument(*flags, **settings)
+
+    return cli.parse_args()
+
+def main() -> int:
+    """Run the command-line tool.
+
+    Returns:
+        Process exit status: ``0`` when at least one domain was written, ``1``
+        when no sources are configured or no domain could be collected (for
+        example because every source failed to download).
+    """
+    args = parse_args()
+    urls = args.urls or DEFAULT_URLS
+
+    if not urls:
+        print("Error: no URLs were provided. Use -u URL or edit DEFAULT_URLS.", file=sys.stderr)
+        return 1
+
+    count = build_output(urls, args.output)
+
+    # An empty adlist almost always means every source failed; report failure
+    # so scheduled jobs do not silently deploy a list that blocks nothing.
+    if count == 0:
+        print(
+            f"[ERROR] No domains were collected; {args.output} contains no entries.",
+            file=sys.stderr,
+        )
+        return 1
+
+    print(f"[OK] Output written to: {args.output}", file=sys.stderr)
+    print(f"[OK] Total unique domains: {count}", file=sys.stderr)
+    return 0
+
+if __name__ == "__main__":
+    raise SystemExit(main())