TFCweb: Fitxer de Codi src/Uri.cc

Veure la documentació d'aquest fitxer.
 
 /*
  * Copyright (c) 2012 Toni Corvera
  *
  * This file is part of TFCWeb.
  *
  * TFCWeb is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
  *
  * TFCWeb is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with TFCWeb. If not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "Uri.h"
 #include "utils.h"
 
 #include <cassert>
 #include <iostream>
 #include <locale>
 #include <string>
 #include <vector>
 #include <boost/algorithm/string.hpp>
 #include <boost/array.hpp>
 #include <boost/assign.hpp>
 #include <boost/filesystem.hpp>
 #include <boost/lexical_cast.hpp>
 #include <boost/tokenizer.hpp>
 #include <boost/xpressive/xpressive_static.hpp>
 
 using namespace boost;
 using namespace std;
 using namespace boost::xpressive;
 
 namespace bx = boost::xpressive;
 
 namespace {
 
 using namespace tfc;
 
 // Inserció estàndard:
 // std::copy(comp.begin(), comp.end(), ostream_iterator<string>(oss, "/"));
 template<typename S, typename I> S join_r(const I & beg, const I & end, const S & sep) {
     S ret;
     for (I it = beg; it != end; ++it) {
         ret.append(sep);
         ret.append(*it);
     }
     return ret;
 }
 
 template<typename TipusStr>
 struct FunctorCodificacio {
     typedef typename TipusStr::value_type TipusChar;
     TipusStr & desti;
     FunctorCodificacio(TipusStr & destinacio) : desti(destinacio) {}
     static bool cal_codificar(TipusChar c) {
         static boost::array<char,15> NO_SEGURS = { { '{', '}', '|', '\\', '^', '~', '[', ']', '.',
                                                      // Reservables segons esquema:
                                                      '/', '?', ':', '@', '=', '&'
                                                  } };
         return (c > 127) || // fora d'US-ASCII
                (c < 0x1F) || (c == 0x7F) || // caràcters de control
                (NO_SEGURS.end() != find(NO_SEGURS.begin(), NO_SEGURS.end(), c)) // no segurs
         ;
     }
 
     void operator()(const TipusChar & c) {
         // Caracters no segurs (FIXME:)
         // Caracters fora de rang US-ASCII
         if (cal_codificar(c)) {
             string hex = utils::to_string<int>(c, std::hex);
             boost::to_upper(hex);
             desti.append("%"+hex);
             return;
         }
         desti.append(1, c);
     }
 };
 
 string reconstrueix(const string & esquema, const string & autoritat,
                     const string & ruta, const string & params,
                     const string & query, const string & fragment)
 {
     ostringstream ss;
     if (!esquema.empty()) {
         ss << esquema << ":";
     }
 
     if (!autoritat.empty()) {
         ss << "//" << autoritat;
     }
 
     ss << ruta;
 
     if (!params.empty()) {
         ss << ';' << params;
     }
 
     if (!query.empty()) {
         ss << '?' << query;
     }
 
     if (!fragment.empty()) {
         ss << '#' << fragment;
     }
     return ss.str();
 }
 
 inline string reconstrueix(const Uri & u) {
     return reconstrueix(u.esquema(), u.autoritat(), u.ruta(),
                         u.params(), u.query(), u.fragment());
 }
 
 } // ns anònim
 
 // ------- Sintaxi d'URIs --------
 //
 // Regles extretes de la secció "Augmented BNF" de l'RFC 2068 (pàg. 15)
 // expressades amb Boost Xpressive.
 //
 // RFC 2068, pg.18, 19:
 //
 // URI = ( absoluteURI | relativeURI ) [ "#" fragment ]
 //
 // absoluteURI = scheme ":" *( uchar | reserved )
 // relativeURI = net_path | abs_path | rel_path
 //
 // net_path = "//" net_loc [ abs_path ]
 // abs_path = "/" rel_path
 // rel_path = [ path ] [ ";" params ] [ "?" query ]
 //
 // path = fsegment *( "/" segment )
 // fsegment = 1*pchar
 // segment = *pchar
 //
 // params = param *( ";" param )
 // param = *( pchar | "/" )
 //
 // scheme = 1*( ALPHA | DIGIT | "+" | "-" | "." )
 // net_loc = *( pchar | ";" | "?" )
 //
 // query = *( uchar | reserved )
 // fragment = *( uchar | reserved )
 //
 // pchar = uchar | ":" | "@" | "&" | "=" | "+"
 // uchar = unreserved | escape
 // unreserved = ALPHA | DIGIT | safe | extra | national
 //
 // escape = "%" HEX HEX
 // reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+"
 // extra = "!" | "*" | "'" | "(" | ")" | ","
 // safe = "$" | "-" | "_" | "."
 // unsafe = CTL | SP | <"> | "#" | "%" | "<" | ">"
 // national = <any OCTET excluding ALPHA, DIGIT,
 //             reserved, extra, safe, and unsafe>
 //
 // ----------------------------------------------------------------------
 //
 // En Xpressive els parèntesis no capturen, només agrupen.
 // La diferència entre `set[ ]` i `( )` és que els sets es poden negar amb `~`
 // Static Boost.Xpressive transcription of the RFC 2068 URI grammar quoted above.
 // Each rule is an `sregex`; the grammar rule it implements is repeated in the
 // comment next to it. Rules are defined before the rules that embed them, so
 // the order of the definitions matters.
 // Notation used below: a prefix `!` makes a sub-expression optional (grammar
 // `[ x ]`), prefix `*` and `+` are the `*x` and `1*x` repetitions, `>>` is
 // concatenation and `|` is alternation.
 namespace RFC2068_Sintaxi_URIs {
 
           // unsafe = CTL | SP | <"> | "#" | "%" | "<" | ">"
           // (imbue(locale::classic()) makes the character classes use the classic "C" locale)
 const sregex unsafe = imbue(locale::classic())(bx::set[ cntrl | char(32) | '"'
                       | '#' | '%' | '<' | '>' ])
           // safe = "$" | "-" | "_" | "."
             ,safe = bx::set[ as_xpr('$') | '-' | '_' | '.' ]
           // extra = "!" | "*" | "'" | "(" | ")" | ","
             ,extra = bx::set[ as_xpr('!') | '*' | '\'' | '(' | ')' | ',' ]
           // reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+"
             ,reserved = bx::set[ as_xpr(';') | '/' | '?' | ':' | '@' | '&' | '=' | '+' ]
             ,escape = '%' >> xdigit >> xdigit // "%" HEX HEX
             // national does not compile when written in terms of references (plain
             // or by_ref()) to reserved, etc., so the characters of those sets are
             // repeated inline inside a single negated set.
             // national = <any OCTET excluding ALPHA, DIGIT, reserved, extra, safe, and unsafe>
             ,national = imbue(locale::classic())(~bx::set[ alpha | digit
                             | ';' | '/' | '?' | ':' | '@' | '&' | '=' | '+' // reserved
                             | '!' | '*' | '\'' | '(' | ')' | ',' // extra
                             | '$' | '-' | '_' | '.' // safe
                             | cntrl | char(32) | '"' | '#' | '%' | '<' | '>' // unsafe
             ])
 ;
 
           // unreserved = ALPHA | DIGIT | safe | extra | national
 const sregex unreserved = imbue(locale::classic())( alpha | digit | safe | extra | national );
 const sregex uchar = ( unreserved | escape ); // = unreserved | escape
           // pchar = uchar | ":" | "@" | "&" | "=" | "+"
 const sregex pchar = ( uchar | ':' | '@' | '&' | '=' | '+' ) ;
 const sregex query = *( uchar | reserved ) // = *( uchar | reserved )
             ,fragment = *( uchar | reserved ) // = *( uchar | reserved )
 ;
 const sregex net_loc = *( pchar | ';' | '?' ) // = *( pchar | ";" | "?" )
           // scheme = 1*( ALPHA | DIGIT | "+" | "-" | "." )
             ,scheme = +( alpha | digit | '+' | '-' | '.' )
 ;
 const sregex param = *( pchar | '/' ); // *( pchar | "/" )
 const sregex params = param >> *( ';' >> param ); // = param *( ";" param )
 
 const sregex segment = *pchar // = *pchar
             ,fsegment = +pchar; // = 1*pchar
 const sregex path = fsegment >> *( '/' >> segment ); // = fsegment *( "/" segment )
 
           // rel_path = [ path ] [ ";" params ] [ "?" query ]
 const sregex rel_path = !path >> !( ';' >> params ) >> !( '?' >> query );
 const sregex abs_path = '/' >> rel_path ; // = "/" rel_path
 const sregex net_path = "//" >> net_loc >> !abs_path; // = "//" net_loc [ abs_path ]
 
           // absoluteURI = scheme ":" *( uchar | reserved )
 const sregex absoluteURI = scheme >> ':' >> *( uchar | reserved );
           // relativeURI = net_path | abs_path | rel_path
 const sregex relativeURI = net_path | abs_path | rel_path ;
 
           // URI = ( absoluteURI | relativeURI ) [ "#" fragment ]
 const sregex URI = ( absoluteURI | relativeURI ) >> !( '#' >> fragment ) ;
 
 // Component extraction: same shapes as the rules above, with the interesting
 // parts captured in the numbered sub-matches s1, s2, s3.
 const sregex
              // The definition of URI is too generic ("greedy") to be used for extraction
              // 1: scheme, 2: (relative) URL, 3: fragment
              ex_URI = !( (s1=scheme) >> ':' ) >> ( s2=relativeURI ) >> !( '#' >> (s3=fragment) ),
              // 1: host (net_loc), 2: path (optional abs_path)
              ex_net_path = "//" >> (s1 = net_loc) >> (s2 = !abs_path),
              // 1: path, 2: parameters, 3: query string
              ex_rel_path = !( s1=path ) >> !( ';' >> (s2=params) ) >> !( '?' >> (s3=query) )
 ;
 
 } // ns RFC2068_Sintaxi_URIs
 
 namespace {
 inline bool es_parseable(const string & s) {
     return regex_match(s, RFC2068_Sintaxi_URIs::URI);
 }
 } // ns anònim (continuació)
 
 namespace tfc {
 
 Uri::Uri(const string & uri) throw (ErrorAnalisiUrl) {
     using namespace RFC2068_Sintaxi_URIs;
 
     if (!regex_match(uri, URI)) {
         throw ErrorAnalisiUrl();
     }
     smatch m;
     regex_match(uri, m, ex_URI);
     const string esquema = m[1], relativa = m[2], fragment = m[3];
     assert( regex_match(relativa, relativeURI) );
     bool es_net_path = regex_match(relativa, net_path),
          es_abs_path = regex_match(relativa, abs_path),
          es_rel_path = regex_match(relativa, rel_path);
     assert( es_net_path | es_abs_path | es_rel_path );
     string netloc, abspath, relpath;
     if (es_net_path) {
         regex_match(relativa, m, ex_net_path);
         netloc = m[1];
         abspath = m[2];
     }
     else if (es_abs_path) {
         abspath = relativa;
     }
     else if (es_rel_path) {
         relpath = relativa;
     }
     if (!abspath.empty()) {
         assert( abspath[0] == '/' );
         relpath = abspath.substr(1);
     }
     string ruta, params, query;
     if (!relpath.empty()) {
         regex_match(relpath, m, ex_rel_path);
         ruta = m[1];
         params = m[2];
         query = m[3];
     }
     if (!abspath.empty()) {
         ruta = "/" + ruta;
     }
 
     esquema_ = esquema;
     autoritat_ = netloc;
     ruta_ = descodifica(ruta); // throws ErrorAnalisiUrl
     query_ = query;
     fragment_ = fragment;
     params_ = params;
     // FIXME: Què més cal descodificar?
 } // Uri::Uri()
 
 Uri& Uri::operator=(const Uri & u)  {
     esquema_ = u.esquema_;
     autoritat_ = u.autoritat_;
     ruta_ = u.ruta_;
     params_ = u.params_;
     query_ = u.query_;
     fragment_ = u.fragment_;
     return *this;
 }
 
 void Uri::ruta(const string & r) throw (ErrorAnalisiUrl) {
     ruta_ = descodifica(r);
 }
 
 bool Uri::operator==(const Uri & u) const {
     if (this == &u) {
         return true;
     }
     return str() == u.str(); // Equivalència
 }
 
 bool Uri::es_uri_absoluta() const {
     return esquema_.length() != 0 && autoritat_.length() != 0;
 }
 
 bool Uri::te_ruta_absoluta() const {
     return ruta_.empty() || ruta_.at(0) == '/';
 }
 
 string Uri::str() const {
     return reconstrueix(*this);
 }
 
 // FIXME: Portabilitat de path, veure http://www.boost.org/doc/libs/1_49_0/libs/filesystem/v3/doc/reference.html#Path-decomposition-table
 
 // static
 Uri Uri::normalitza(const Uri & u) {
     assert( es_parseable(u) ); // INVARIANT de Uri
 
     using filesystem::path;
     string p = u.ruta();
     // Casos trivials
     if (p.empty() || p == "/") {
         return u;
     }
     if (p == ".") {
         Uri copia(u);
         copia.ruta_.clear();
         return copia;
     }
 
     // Eliminació de barres repetides
     const sregex dblbarra = as_xpr('/') >> +as_xpr('/'); // = regex("//+")
     p = regex_replace(p, dblbarra, "/"); // , boost::match_default | boost::format_perl);
     if (p == "/") {
         Uri copia(u);
         copia.ruta_.swap(p);
         return copia;
     }
     // Interpretació de '.' i '..'
     const char_separator<char> sepdir("/");
     tokenizer< char_separator<char> > tokens(p, sepdir);
 
     vector<string> comp;
     for (auto it = tokens.begin(); it != tokens.end(); ++it) {
         const string & elem = *it;
         if (elem == ".") {
             // Ignorar
             comp.push_back(string());
             continue;
         }
         if (elem == "..") {
             // Elimina el directori actual, si n'hi ha
             if (!comp.empty()) {
                 comp.pop_back();
             }
             continue;
         }
         comp.push_back(elem);
     }
 
     string nova_ruta = join_r(comp.begin(), comp.end(), string("/"));
     // Retindre "/" a l'inici si en tenia i l'ha perdut
     if (nova_ruta.empty() && !p.empty() && p.at(0) == '/') {
         nova_ruta = "/";
     }
     const string nova_url = reconstrueix(u.esquema(), u.autoritat(),
                                          nova_ruta, u.params(),
                                          u.query(), u.fragment());
     assert( es_parseable(nova_url) );
     return Uri(nova_url);
 }
 
 // static
 string Uri::descodifica(const string & s, bool & error) throw() {
     string ret;
     ret.reserve(s.length());
     error = false;
     // Caràcters hexadecimals, expressió regular per simplicitar comprovació
     // en xpressive "xdigit >> xdigit" (= '[[:xdigit:][:xdigit:]]')
     using namespace boost::xpressive;
     const sregex hex_re = xdigit >> xdigit;
     string::size_type idx = 0, percent;
     while (string::npos != (percent = s.find('%', idx))) {
         assert( s.at(percent) == '%' );
         // copiem la part de la cadena saltada [idx, percent):
         ret.append(s, idx, percent-idx);
         // comprovem que es tracta d'un codi hexadecimal vàlid
         // si no fós el cas ho acceptem, però marquem l'error
         if (s.length() < (percent+3)) {
             error = true;
             idx = percent+1;
             ret.append(1, s.at(percent));
             continue;
         }
         assert( s.length() >= (percent+3) );
         const string hex(s.substr(percent+1, percent+3), 0, 2);//0,2 necessari!
         assert( hex.length() == 2 );
         if (!regex_match(hex, hex_re)) {
             // Caràcters NO hexadecimals, com abans marquem error i saltem
             error = true;
             idx = percent+1;
             ret.append(1, s.at(percent));
             continue;
         }
         const int C = tfc::utils::from_string<int>(hex, std::hex);
         assert( C <= 0xFF ); // dos dígits hexadecimals <-> valors d'un byte
         const char c = static_cast<char>(C);
         ret.append(1, c);
         idx = percent+3;
     }
     // resta
     ret.append(s.substr(idx));
     return ret;
 }
 
 // static
 string Uri::descodifica(const string & url) throw (ErrorAnalisiUrl) {
     bool e;
     const string s = descodifica(url, e);
     if (e) {
         throw ErrorAnalisiUrl("Error descodificant URL");
     }
     return s;
 }
 
 // static
 string Uri::codifica(const string & s) throw () {
     string ret;
     ret.reserve(s.length()); // mínim
     for_each(s.begin(), s.end(), FunctorCodificacio<string>(ret));
     return ret;
 }
 
 // friend
 ostream& operator<<(ostream & os, const Uri & u) {
 #if 0
     return os << "["  << u.esquema() << ", " << u.autoritat()
               << ", " << u.ruta() << ", " << u.query() << ", "
               << u.fragment() << "]";
 #endif
     return os << string(u);
 }
 
 } // ns tfc
 
 // vim:set ts=4 et ai: //