#include "urls.hh"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <string>
#include <vector>

using namespace std;

// Predicate: returns true when `c` can NOT be part of a URL.
//
// A character is accepted as a URL character when it is a letter or digit
// (per the current C locale) or one of the punctuation characters listed in
// `url_ch` below; everything else (whitespace, brackets, '#', '%', bytes
// outside ASCII, ...) is rejected.
static bool not_url_char(char c) {
    // Punctuation that may appear in a URL in addition to letters and digits
    static const std::string url_ch = "~;/?:@=&$-_.+!*'(),";

    // The <cctype> classifiers have undefined behaviour when given a negative
    // value other than EOF, which is what a plain `char` holding a byte
    // >= 0x80 turns into on platforms where `char` is signed. Convert to
    // `unsigned char` before classifying.
    if (std::isalnum(static_cast<unsigned char>(c)))
        return false;

    // Not alphanumeric: it is a URL character only if it is listed above
    return std::find(url_ch.begin(), url_ch.end(), c) == url_ch.end();
}

static string::const_iterator
url_end(string::const_iterator b, string::const_iterator e) {
    return find_if(b, e, not_url_char);
}

static string::const_iterator
url_begin(string::const_iterator b, string::const_iterator e) {
    static const string sep = "://";
    typedef string::const_iterator iter;
    iter i = b; // `i` marks where separator was found
    while ((i = search(i, e, sep.begin(), sep.end())) != e) {
        // Make sure the separator isn't at the end of string
        if (i + sep.size() != e) {
            iter beg = i; // `beg` marks start of protocol-name
            while (beg != b && isalpha(beg[-1]))
                --beg;
            // At least one good char before and after ://?
            if (beg != i && !not_url_char(i[sep.size()]))
                return beg;
        }
        // Found separator wasn't part of a URL, move past it
        i += sep.size();
    }
    return e;
}

vector<string> find_urls(const string& s) {
    vector<string> ret;
    typedef string::const_iterator iter;
    // Look through the entire input
    iter b = s.begin(), e = s.end();
    while (b != e) {
        // Look for one or more letters followed by `://`
        b = url_begin(b, e);
        // If we found it
        if (b != e) {
            // Get the rest of the URL
            iter after = url_end(b, e);
            // Remember the URL
            ret.push_back(string(b, after));
            // Advance `b` and check for more URLs
            b = after;
        }
    }
    return ret;
}

// Self-test driver: runs `find_urls` over a handful of inputs and asserts on
// the extracted URLs. Returns 0 when every assertion holds.
int main() {
    // Empty input: nothing to find
    const std::vector<std::string> from_empty = find_urls("");
    assert(from_empty.empty());

    // Text without any "://" separator
    const std::vector<std::string> from_plain = find_urls("no URL here");
    assert(from_plain.empty());

    // A separator with no protocol name in front of it is skipped
    const std::vector<std::string> from_mixed =
        find_urls("://missing-protocol http://abc.def garbage");
    assert(from_mixed.size() == 1);
    assert(from_mixed[0] == "http://abc.def");

    // A separator at the very end of the input is not a URL
    const std::vector<std::string> from_lines =
        find_urls("ftp://one.two\nhttp://abc.def\ngarbage://");
    assert(from_lines.size() == 2);
    assert(from_lines[0] == "ftp://one.two");
    assert(from_lines[1] == "http://abc.def");

    // A separator followed by a non-URL character is not a URL either
    const std::vector<std::string> from_brace =
        find_urls("ftp://one.two\nhttp://abc.def\ngarbage://}");
    assert(from_brace.size() == 2);
    assert(from_brace[0] == "ftp://one.two");
    assert(from_brace[1] == "http://abc.def");

    return 0;
}
