Programma... info e suggerimenti.

di il
4 risposte

Programma... info e suggerimenti.

Salve ragazzi, sono nuovo nel forum ed è da giorni che provo a dare una risposta al mio problema ma non la trovo... considerando che delle mie memorie di programmazione passate (sono ormai 10 anni che non ci metto mano) non è rimasto granché...

Volevo chiedervi un consiglio/aiuto sulla soluzione migliore...

dovrei creare un "programma" capace di decifrare le "sorgenti" di una pagina web e "trasformarle" in una tabella all'interno di una pagina html.

Cosa potete consigliarmi / suggerirmi prima di iniziare a buttare giù una bozza che poi dovrei buttare?

Ringrazio in anticipo chi potrà darmi una mano...se avete bisogno di altre info riguardo al mio "problema" ditemi che vi spiego meglio..

Grazie e buona serata.

Antonio.

4 Risposte

  • Re: Programma... info e suggerimenti.

    C'è un utente nel forum particolarmente esperto su queste elaborazioni: vedrai, ti spiegherà.
  • Re: Programma... info e suggerimenti.

    Grazie per avermi avvertito, lo aspetterò allora... ricordi per caso il nick?
  • Re: Programma... info e suggerimenti.

    Ciao, non penso sia la sezione giusta per parlare di scraping
  • Re: Programma... info e suggerimenti.

    Problema... non riesco a convertire le sorgenti in "tabella": qualcuno riesce ad aiutarmi?
    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="description" content="Convert HTML to CSV/Excel" />
    <meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
    <meta http-equiv="Pragma" content="no-cache" />
    <meta http-equiv="Expires" content="0" />
    
    <title>HTML To CSV/Excel Converter</title>
    <script src="/js/underscore-min.js"></script>
    <script src="/js/blob.js"></script>
    <script src="/js/filesaver.js"></script>
    <script src="json2.js"></script>
    <script src="strsup.js?v=7"></script>
    <script src="localread.js?v=7"></script>
    <script src="csvparse.js?v=7"></script>
    <script src="csvsup.js?v=25x"></script>
    
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.2.1/jquery.min.js" type="text/javascript"></script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script>
    <script src="https://cdn.jsdelivr.net/alasql/0.3.8/alasql.min.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/xlsx/0.11.5/xlsx.full.min.js"></script>
    <link rel="stylesheet" href="/js/custom.css?v=1">
    
    <script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
    <script>
         (adsbygoogle = window.adsbygoogle || []).push({
              google_ad_client: "ca-pub-2674404638298268",
              enable_page_level_ads: true
         });
    </script>
    
    <script type="text/javascript">
    /**
     * Store the given text in the #txt1 input box, then press the #btnRun
     * button so the freshly loaded text is converted right away.
     * Used as the callback handed to loadTextFile() by the file controls.
     *
     * @param {string} s  Text to place in the input textarea.
     */
    function assignText(s) {
       var inputBox = document.getElementById('txt1');
       inputBox.value = s;

       var runButton = document.getElementById('btnRun');
       runButton.click();
    }
    /**
     * Convert the HTML found in #txt1 into delimited text and show it in #txta.
     *
     * Reads the output options from the form (separator, forced quoting,
     * line-break handling, tag stripping, space crunching, which table to
     * export), copies the input into the hidden #divHtml container so the
     * browser parses it, then walks every <tr> of the selected <table>
     * element(s) and emits one output line per row. Afterwards it rebuilds the
     * "Which table?" <select> inside #spanTabNum and the counter in #spanCount.
     *
     * Relies on helpers defined outside this block: radiovalue(), getOrdinal(),
     * String.prototype.toCsv and Underscore's _.unescape.
     *
     * Rows that contain no <td>/<th> cells are skipped. Fields are joined with
     * the separator instead of appending it and slicing it off afterwards: the
     * old slice erased the whole output when the separator was empty and ate
     * characters of the previous line when a row had no cells.
     */
    function runit() {
        var delimiter = radiovalue(document.getElementById('frm1').outsep);
        var noMultiLines = document.getElementById('chkNoBreaks').checked;
        // The "Other" radio carries the placeholder value "o"; the real separator is in the text box.
        if (delimiter == "o") delimiter = document.getElementById("outSepOtherVal").value;
        var whichTable = document.getElementById('selTabNum').value;
        whichTable = whichTable || "0"; // "0" selects every table
        var bQuotes = (document.getElementById('chkCsvQuotes')).checked;
        var removeTags = (document.getElementById('chkRemoveTags')).checked;
        var crunch = (document.getElementById('chkCrunch')).checked;
        var html=document.getElementById('divHtml');
        // Rename script/style/img/link opening tags before the text is parsed,
        // presumably so the pasted markup does not pull in those resources.
        // NOTE(review): untrusted markup still reaches innerHTML; other active
        // content (inline event-handler attributes, iframes, ...) is not
        // neutralised here. Consider parsing in an inert document instead.
        html.innerHTML = document.getElementById('txt1').value.replace(/<script/gmi, "<xxxxx").replace(/<style/gmi, "<yyyyy").replace(/<img/gmi, "<zzzz").replace(/<link/gmi, "<abcd");
        var s="";
        var rows;   // declared locally; it used to leak as an implicit global
        var cells;
        var fields;
        var value;
        var tbl = html.getElementsByTagName('table');
        var cnt=tbl.length;
        // Matches a single opening, closing or self-closing tag including its attributes.
        var re = new RegExp("<\/?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)\/?>",'igm');

        for(var j=0;j<tbl.length;j++) {
            // Table numbers shown to the user are 1-based.
            if( (""+(j+1)) != whichTable && whichTable!="0") continue;
            rows=tbl[j].getElementsByTagName('tr');
            for(var k=0;k<rows.length;k++) {
                if ('querySelectorAll' in document) {
                   cells=rows[k].querySelectorAll('td,th');
                } else {
                   // Fallback without querySelectorAll: data cells first, header cells otherwise.
                   cells=rows[k].getElementsByTagName('td');
                   if(!cells || cells.length==0) {
                      cells=rows[k].getElementsByTagName('th');
                   }
                }
                // Nothing to export for a row without cells.
                if(!cells || cells.length==0) continue;

                fields=[];
                for(var n=0;n<cells.length;n++) {
                   value=cells[n].innerHTML;
                   if(value==null)value="";else value+="";
                   // Source line breaks are not significant; only <br>/<li> produce breaks below.
                   value = value.replace(/\r\n|\r|\n/gmi,' ');
                   if(noMultiLines) value = value.replace(/\n|<br>|<br\/>|<br \/>/gmi,' ');
                   else {
                       value = value.replace(/<br>|<br\/>|<br \/>/gmi,'\n');
                       value = value.replace(/(<li>|(<li (?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>))/gmi,'\n');
                   }
                   if (removeTags) value=value.replace(re,'');
                   value=_.unescape(value);
                   value=value.replace(/&nbsp;/gmi," ");
                   if(crunch)value=value.replace(/ {2,}/g, ' ');
                   value=value.trim();
                   if(bQuotes) {
                      // Forced quoting: always wrap and double any embedded quote.
                      fields.push('"' + value.replace(/"/gmi, '""') + '"');
                   }
                   else {
                      fields.push(value.toCsv(delimiter,'"'));
                   }
                }
                s += fields.join(delimiter) + "\n";
            }
        }
        document.getElementById('txta').value = s;
        if(cnt<1 && document.getElementById('txt1').value.trim() != "") {
           window.alert('No TABLE tag found in HTML. Please check your input.');
        }
        // Rebuild the table selector so it lists exactly the tables just found.
        s = "<select id=\"selTabNum\"  onchange=\"document.getElementById('btnRun').click()\">";
        s += "<option value=0>-All-</option>";
        for(j=0;j<cnt;j++) {
           s+= "<option value=\"" + (j+1) + "\" ";
           if( (j+1)==whichTable) s+=" selected";
           s+= ">" + getOrdinal(j+1) + "</option>";
        }
        s+="</select>";
        document.getElementById("spanTabNum").innerHTML = s;
        document.getElementById('spanCount').innerHTML = "(Tables found: " + cnt + ")";
    }
    /**
     * Load the built-in example: switch to the "Enter URL" tab when its link
     * exists, fill the URL field with the sample page and press "Load URL".
     */
    function runExample() {
        var urlTab = document.getElementById("urlTabLink");
        if (urlTab) {
            urlTab.click();
        }

        var urlField = document.getElementById('url');
        urlField.value = 'http://www.tournamentmaps.com/florida-tennis-colleges.htm';

        var loadButton = document.getElementById('btnUrl');
        loadButton.click();
    }
    </script>
    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-110011798-1"></script>
    <script>
      window.dataLayer = window.dataLayer || [];
      function gtag(){dataLayer.push(arguments);}
      gtag('js', new Date());
    
      gtag('config', 'UA-110011798-1');
    </script>
    </head> 
    <body onload="document.getElementById('btnRun').click()">
    
    <div class="container-fluid">
        <div class="row">
    	    <div class="jumbotron pageHeader">
                    <h1 class="text-center">Prova SPQR</h1>
            </div>
        </div>
        <div class="row">
            <div class="container-fluid pageSubheader">
                <h2>Prova SPQR</h2>
            </div>
        </div>
    	<div class="row">
        <div class="col-md-3">
            <script src="/js/csvlinks.js?v=6"></script>
    <script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
    <!-- Downsides Wide -->
    <ins class="adsbygoogle"
         style="display:inline-block;width:300px;height:600px"
         data-ad-client="ca-pub-2674404638298268"
         data-ad-slot="2020261410"></ins>
    <script>
    (adsbygoogle = window.adsbygoogle || []).push({});
    </script>
    
     
    
        </div>
        <div class="col-md-9">
    
    <script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
    <!-- ResponsiveAd -->
    <ins class="adsbygoogle"
         style="display:block"
         data-ad-client="ca-pub-2674404638298268"
         data-ad-slot="9209491414"
         data-ad-format="auto"></ins>
    <script>(adsbygoogle = window.adsbygoogle || []).push({});</script>
    
        <form id="frm1" name="frm1" class="form-inline" role="form" onsubmit="return false">
           
                <ul class="nav nav-tabs">
                    <li class="nav-item active"><a id="defaultTabLink" data-toggle="tab" href="#inputtext">Enter Data</a></li>
                    <li class="nav-item"><a id="fileTabLink" data-toggle="tab" href="#inputfile" class="nav-link">Choose File</a></li>
                    <li class="nav-item"><a id="urlTabLink" data-toggle="tab" href="#inputurl" class="nav-link">Enter URL</a></li>
                </ul>
                <br />
    
                <div class="tab-content">
                    <div id="inputtext" class="tab-pane active">
    <textarea class="form-control" style="width: 90%;" rows="10" cols="80" id="txt1" wrap="off" placeholder="Enter or paste here"></textarea>
                    </div>
                    <div id="inputfile" class="tab-pane">
                        <label xclass="form-control">Choose File<input type="file" id="f1" class="form-control" onchange="loadTextFile(this,assignText,event)" title="Choose a local HTML file" /></label>
    
                        <!-- The label wraps the visible text so the encoding select has an accessible name; the span id is kept in case a script refers to it. -->
                        <label for="txtEncoding"><span id="spanEncoding">Encoding</span></label>
                        <select id="txtEncoding" class="form-control" title="Enter encoding for input file" onchange="loadTextFile(document.getElementById('f1'),assignText)">
                            <option value="" selected="selected">-Default-</option>
                            <option value="ISO-8859-1">ISO-8859-1 (Latin No. 1)</option>
                            <option value="ISO-8859-2">ISO-8859-2 (Latin No. 2)</option>
                            <option value="ISO-8859-3">ISO-8859-3 (Latin No. 3)</option>
                            <option value="ISO-8859-4">ISO-8859-4 (Latin No. 4)</option>
                            <option value="ISO-8859-5">ISO-8859-5 (Latin/Cyrillic)</option>
                            <option value="ISO-8859-6">ISO-8859-6 (Latin/Arabic)</option>
                            <option value="ISO-8859-7">ISO-8859-7 (Latin/Greek)</option>
                            <option value="ISO-8859-8">ISO-8859-8 (Latin/Hebrew)</option>
                            <option value="ISO-8859-9">ISO-8859-9 (Latin No. 5)</option>
                            <option value="ISO-8859-13">ISO-8859-13 (Latin No. 7)</option>
                            <option value="ISO-8859-15">ISO-8859-15 (Latin No. 9)</option>
                            <option value="macintosh">Mac OS Roman</option>
                            <option value="UTF-8">UTF-8</option>
                            <option value="UTF-16">UTF-16</option>
                            <option value="UTF-16BE">UTF-16 (Big-Endian)</option>
                            <option value="UTF-16LE">UTF-16 (Little-Endian)</option>
                            <option value="UTF-32">UTF-32</option>
                            <option value="UTF-32BE">UTF-32 (Big-Endian)</option>
                            <option value="UTF-32LE">UTF-32 (Little-Endian)</option>
                            <option value="windows-1250">windows-1250 (Win East European)</option>
                            <option value="windows-1251">windows-1251 (WinCyrillic)</option>
                            <option value="windows-1252">windows-1252 (WinLatin-1)</option>
                            <option value="windows-1253">windows-1253 (WinGreek)</option>
                            <option value="windows-1254">windows-1254 (Win Turkish)</option>
                            <option value="windows-1255">windows-1255 (Win Hebrew)</option>
                            <option value="windows-1256">windows-1256 (Win Arabic)</option>
                            <option value="windows-1257">windows-1257 (Win Baltic)</option>
                            <option value="windows-1258">windows-1258 (Win Vietnamese)</option>
                        </select>
    
                    </div>
                    <div id="inputurl" class="tab-pane">
                        <label>
                            Enter URL as data source
                            <input type="text" size="40" value="" name="url" id="url" class="form-control" title="Enter the URL of a web page returning HTML with a table" />
                        </label>
                        <input type="button" id="btnUrl" class="btn btn-primary" value="Load URL" title="Load HTML via URL" onclick="loadURL(document.getElementById('url').value)" />
                    </div>
                </div>
    
                <!-- Input actions. The unmatched closing div that followed this block was removed: it closed the surrounding column in the middle of the form. -->
                <div class="">
                    <input type="button" class="btn btn-primary" value="Clear Input" onclick="window.location.reload(true)">
                    &nbsp; <input type="button" value="Example" class="btn btn-primary" title="Load and run example" onclick="runExample()">
                </div>
    
            <br />
    
            <!-- Removed an empty, never-closed <small> that the parser would re-open around the content that follows. -->
            <h3 class="headerBlue">Opzionali</h3><a href="#" onclick="return false" data-toggle="collapse" data-target="#p4"> <span class="glyphicon glyphicon-chevron-down"></span></a>
            <hr class="noverticalspace" />
            <fieldset class="scheduler-border collapse" id="p4">
                <legend class="scheduler-border">Output Options</legend>
                Output Field Separator:
                <label><input type="radio" name="outsep" id="outSepComma" value="," checked="checked"> ,</label> &nbsp;
                <label><input type="radio" name="outsep" id="outSepSemicolon" value=";"> ;</label> &nbsp;
                <label><input type="radio" name="outsep" id="outSepColon" value=":"> :</label> &nbsp;
                <label><input type="radio" name="outsep" id="outSepPipe" value="|"> Bar-|</label> &nbsp;
                <label><input type="radio" name="outsep" id="outSepTab" value=" " onclick="this.value='\t'"> Tab</label> &nbsp;
                <label><input type="radio" name="outsep" id="outSepOther" value="o"> Other-Choose</label>
                <label><input type="text" size="2" id="outSepOtherVal" value="@" /></label>
                <br />
                <label><input id="chkCsvQuotes" type="checkbox" /> Force Wrap values in double quotes</label>
                <br />
                <label><input type="checkbox" id="chkNoBreaks" value="Y"> No line breaks in CSV</label>
                (Use this to remove line breaks in field values)
                <br />
                <label><input type="checkbox" id="chkRemoveTags" value="Y" checked="checked"> Remove HTML tags in CSV</label>
                (Use this to prevent seeing HTML tags in your output)
                <br />
                <label><input type="checkbox" id="chkCrunch" value="Y" checked="checked"> Replace multiple spaces with 1 space in CSV</label>
                (Use this if you have too much space in your output)
            </fieldset>
            <br />
            <h3 class="headerBlue">Step 3: Generate output</h3><br />
            <input type="button" id="btnRun" class="btn btn-primary" value="Convert HTML To CSV" title="Convert HTML Table To CSV" onclick="runit();return false">
            <input type="button" class="btn btn-primary" onclick="runit();saveExcel('txta',false);return false" value="HTML To Excel" title="Save output as an Excel file" />
            &nbsp;
            <label>
                Which table? <span id="spanTabNum">
                    <select id="selTabNum" onchange="document.getElementById('btnRun').click()">
                        <option value="0" selected="selected">All</option>
                    </select>
                </span>
            </label>
            <span id="spanCount"></span>
    
    	<div class="form-group-inline">
    		<label for="txta" class="control-label">Result Data:</label><br/>
    		<textarea id="txta" rows="15" cols="80" style="width:90%" wrap="off" placeholder="Output Results" class="form-control"></textarea>
    	</div><br />
    	<div class="form-group form-inline">
    		<label>Save your result:</label>
    			<input type="text" size="15" id="fn" value="convertcsv" class="form-control" title="Enter filename without extension" />.csv
    			<button class="btn btn-primary" onclick="saveFile(document.getElementById('txta').value,'csv');return false"><span class="glyphicon glyphicon-save-file"></span> Download Result</button>
    			<label title="End-of-Line">EOL: <select id="eol" title="CRLF=Windows,LF=Unix/Linux/New Apple/Android"><option value="">CRLF</option><option value="LF">LF</option></select></label>
    
        </div> 
     </form>
    </div>
    <script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
    <!-- ResponsiveAd -->
    <ins class="adsbygoogle"
         style="display:block"
         data-ad-client="ca-pub-2674404638298268"
         data-ad-slot="9209491414"
         data-ad-format="auto"></ins>
    <script>(adsbygoogle = window.adsbygoogle || []).push({});</script>
    
        </div>
        </div>
        <div class="row">
    	    <div class="col-md-12">
    
    	        <div class="panel-footer">
                    <script src="/js/footer.js"></script>
                </div>
    	    </div>
        </div>	
    <div id="divHtml" style="display:none; visibility: hidden; height: 5px"></div>
    </div>
    </body>
    </html>
    
Devi accedere o registrarti per scrivere nel forum
4 risposte