--------Apache Version 2.0 license (AL2.0)----------------------------------------------------------------------------
/*
        Copyright 2021 Andreas Burgstaller

        Licensed under the Apache License, Version 2.0 (the "License");
        you may not use this file except in compliance with the License.
        You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

        Unless required by applicable law or agreed to in writing, software
        distributed under the License is distributed on an "AS IS" BASIS,
        WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
        See the License for the specific language governing permissions and
        limitations under the License.
*/
----------------------------------------------------------------------------------------------------------------------
-- Entry point: scrape the FindMyHome rental listings for Vienna and store the
-- collected ad entries as one JSON array in <working directory>/output/temp_ist.json.
test_mode = .true -- .true: only the first 200 ads; .false: every ad the site reports

homepage = "https://www.findmyhome.at"
adlist_path = "/immo/wohnung-mieten/wien"

-- Build the target path with java.io.File so that the separator matches the
-- platform (the result is unchanged on Windows), and make sure the output
-- directory exists BEFORE the long-running scrape starts.
System = bsf.import("java.lang.System")
current_folder = System~getProperty("user.dir")
output_dir = .bsf~new("java.io.File", current_folder, "output")
output_dir~mkdirs -- no-op when the directory already exists
path = .bsf~new("java.io.File", output_dir~getPath, "temp_ist.json")~getPath

CALL startWebscraping_FMH homepage, adlist_path, test_mode
entryList = result -- org.json.JSONArray with one JSONObject per ad

CALL writeJSONtoFile entryList~toString, path

----------------------------------------------------------------------------------------------------------------------
::REQUIRES "BSF.CLS"
----------------------------------------------------------------------------------------------------------------------

--Routine to write the JSON text into a file.
--
--  Arguments: input - the text to write (the serialized JSON array)
--             path  - path of the target file; an existing file is overwritten
--
--  The text is always encoded as UTF-8 instead of the platform default charset,
--  so non-ASCII characters (e.g. German umlauts) stay valid JSON on every system.
::ROUTINE writeJSONtoFile
  PARSE ARG input, path

    out = .nil
    -- BSF4ooRexx is expected to surface Java exceptions as SYNTAX conditions;
    -- trap them so the file handle is released before the error is passed on.
    SIGNAL ON SYNTAX NAME writeFailed

    out = .bsf~new("java.io.FileOutputStream", path) -- truncates an existing file
    writer = .bsf~new("java.io.OutputStreamWriter", out, "UTF-8")
    writer~write(input)
    writer~flush
    writer~close -- closing the writer also closes the underlying stream
    RETURN

writeFailed:
    IF out \= .nil THEN out~close
    RAISE PROPAGATE -- the caller still sees the original error

----------------------------------------------------------------------------------------------------------------------

--Routine to start the ETL process for the website www.findmyhome.at
--
--  Arguments: homepage    - base URL of the site (no trailing slash)
--             adlist_path - path of the listing page, appended to homepage
--             test_mode   - 1 (.true): scrape only the first 200 ad entries
--                           0 (.false): scrape as many entries as the site reports
--  Returns  : an org.json.JSONArray with one JSONObject per scraped ad entry
::ROUTINE startWebscraping_FMH
    PARSE ARG homepage, adlist_path, test_mode

        pageSize = 20 -- step width of the "entry" offset; presumably the ads per listing page

        --import Jsoup resources
        Jsoup = bsf.import("org.jsoup.Jsoup")

        entryList = .bsf~new("org.json.JSONArray") --create JSON Array

        SAY "Start Webscraping:" homepage || adlist_path

        IF test_mode THEN
            adCount = 200 -- only scrape the first 200 ad entries
        ELSE
            DO
                --the front page is only needed to read the total number of ad entries
                mainpage = Jsoup~connect(homepage || adlist_path)~get()
                counter = mainpage~getElementsByClass("hidden-md hidden-lg col-xs-12")~first()~getElementsByTag("b")~first()
                --delWord(2,2) drops the 2nd and 3rd word of the counter text
                adCount = counter~ownText()~delWord(2,2)
            END

        --Offsets are zero-based, so the last needed page starts at or before
        --adCount - 1. Looping "TO adCount" would request one page too many
        --(e.g. 220 instead of 200 entries in test mode).
        DO offset = 0 TO adCount - 1 BY pageSize
            listpage = Jsoup~connect(homepage || adlist_path || "?entry=" || offset)~get()
            adlist = listpage~getElementsByClass("obj_list")

            DO j = 0 TO (adlist~size - 1)
                anchor = adlist~get(j)~getElementsByTag("a")~first()
                IF anchor \= .nil THEN -- skip list items that carry no link at all
                    DO
                        adref = anchor~attr("href")
                        IF (adref \= "") THEN
                            DO
                                link = homepage || adref
                                CALL scrapeAdEntry_FMH link -- returns JSON Object
                                entryList~put(result) --add JSON Object to Array
                            END
                    END
            END
        END

        RETURN entryList

----------------------------------------------------------------------------------------------------------------------

--Routine for the extraction and transformation of a single FindMyHome ad entry.
--
--  Argument: url - absolute URL of the ad detail page
--  Returns : an org.json.JSONObject with one nested object per target table
--            (adress, data_source, chronology, characteristic, key_figures,
--            apartment, consumption, contact_person, ad)
--
--  Every attribute is assigned on every path. When the page does not contain the
--  element for a column, the column's fallback is used: "" for text columns,
--  "NULL" for numeric columns and "FALSE" for flags. Jsoup lookups that find
--  nothing arrive here as .nil and are checked before any message is sent to them.
::ROUTINE scrapeAdEntry_FMH
    PARSE ARG url

    --Load the HTML document of the ad entry
    Jsoup = bsf.import("org.jsoup.Jsoup")
    adpage = Jsoup~connect(url)~get()
    SAY url -- progress trace

    --Elements that feed more than one column are looked up once
    phoneBox = adpage~getElementById("khandyanzeigenMobile")
    headerValues = adpage~getElementsByClass("immo_header_value")

    --Extract & Transform for TABLE DATA_SOURCE
    data_source = .data_source~NEW()
    --COLUMN <NVARCHAR> SOURCE_NAME
    data_source~name = "FindMyHome"
    --COLUMN <NVARCHAR> URL (value already collected)
    data_source~url = url
    --COLUMN <NVARCHAR> INTERNAL_ID (text of the last <b> inside the phone box)
    data_source~internal_id = ""
    IF phoneBox \= .nil THEN
        data_source~internal_id = fmh_ownText(phoneBox~getElementsByTag("b")~last(), "")

    --Extract & Transform for TABLE CHRONOLOGY
    chronology = .chronology~NEW()
    --COLUMN AD_CREATION_TIME <SECONDDATE>
    updatedAt = adpage~getElementsByAttributeValue("name","openimmo_updated_at")~first()
    IF updatedAt \= .nil THEN
        chronology~creation_time = updatedAt~attr("content")
    ELSE
        chronology~creation_time = "0001-01-01 00:00:00.0000000"
    --COLUMN AVAILABLE_FROM <DATE>
    chronology~available_from = fmh_ownText(adpage~select("tr:contains(Beziehbar ab) td")~last(), "")
    --COLUMN RENTAL_PERIOD <NVARCHAR>
    chronology~rental_period = fmh_ownText(adpage~select("tr:contains(Mietdauer) td")~last(), "nicht befristet")

    --Extract & Transform for TABLE CONTACT_PERSON
    contact_person = .contact_person~NEW()
    --COLUMN CONTACT_INFO <NVARCHAR>
    --The first <b> of the contact box is used unless it is only the generic
    --phone caption. The caption check must look at the element's TEXT;
    --comparing the element object itself with a string never matches.
    contact_person~contact_info = ""
    contactBox = adpage~getElementById("telKontakt")
    IF contactBox \= .nil THEN
        DO
            caption = fmh_ownText(contactBox~getElementsByTag("b")~first(), "")
            IF caption \= "Telefon Allgemein:" THEN
                contact_person~contact_info = caption
        END
    --COLUMN PHONE_NUMBER <NVARCHAR>
    contact_person~phone_number = ""
    IF phoneBox \= .nil THEN
        DO
            caption = fmh_ownText(phoneBox~getElementsByTag("b")~first(), "")
            IF caption \= "Telefon Allgemein:" THEN
                DO
                    rawPhone = phoneBox~html()
                    rawPhone = rawPhone~delStr(39) -- keep only the first 38 characters
                    rawPhone = rawPhone~delWord(1,2) -- drop the two leading words
                    contact_person~phone_number = rawPhone
                END
        END
    --COLUMN EMAIL <NVARCHAR>  not included on this page
    contact_person~email = ""

    --Extract & Transform for TABLE ADRESS
    adress = .adress~NEW()
    --COLUMN <NVARCHAR> COUNTRY (only entries from Vienna!)
    adress~country = "AUT"
    --COLUMN <NVARCHAR> ZIP_CODE
    adress~zip_code = fmh_ownText(headerValues~first(), "")
    --COLUMN STREET <NVARCHAR> (no structured information about street)
    adress~street = ""
    --COLUMN FLOOR <NVARCHAR>
    adress~floor = fmh_ownText(adpage~select("tr:contains(Stock) td")~last(), "")

    --Extract & Transform for TABLE CHARACTERISTIC
    --A feature counts as present as soon as its table row exists
    characteristic = .characteristic~NEW()
    --COLUMN BALCONY <BOOLEAN>
    characteristic~balcony = fmh_exists(adpage~select("tr:contains(Balkon) td")~last())
    --COLUMN BASEMENT <BOOLEAN>
    characteristic~basement = fmh_exists(adpage~select("tr:contains(Keller) td")~last())
    --COLUMN TERRACE <BOOLEAN>
    characteristic~terrace = fmh_exists(adpage~select("tr:contains(Terrassen) td")~last())
    --COLUMN PARKING_SPOT <BOOLEAN> (no information about parking spaces)
    characteristic~parking_spot = "NULL"
    --COLUMN ELEVATOR <BOOLEAN> (the attribute and JSON key are spelled "elavator")
    characteristic~elavator = fmh_exists(adpage~select("tr:contains(Lift) td")~last())

    --Extract & Transform for TABLE KEY_FIGURES
    key_figures = .key_figures~NEW()
    --COLUMN <DOUBLE> TOTAL_AMOUNT
    key_figures~total_amount = fmh_amount(adpage~select("tr:contains(Monatliche Gesamtkosten:) td")~last())
    --COLUMN LIVING_SPACE <INT> (third header value, first word only)
    IF headerValues~size > 2 THEN
        key_figures~living_space = fmh_decimal(headerValues~get(2)~ownText()~delWord(2))
    ELSE
        key_figures~living_space = "NULL"
    --COLUMN <DOUBLE> OPERATING_COST
    key_figures~operating_cost = fmh_amount(adpage~select("tr:contains(Betriebskosten (inkl. MwSt):) td")~last())
    --COLUMN <DOUBLE> HEATING_COST (no information about heating cost)
    key_figures~heating_cost = "NULL"
    --COLUMN <DOUBLE> DEPOSIT
    key_figures~deposit = fmh_amount(adpage~select("tr:contains(Kaution:) td")~last())
    --COLUMN <NVARCHAR> COMMISSION (first cell of the row, leading word removed)
    commissionCell = adpage~select("tr:contains(Provision) td")~first()
    IF commissionCell \= .nil THEN
        key_figures~commission = commissionCell~ownText()~delWord(1,1)
    ELSE
        key_figures~commission = ""

    --Extract & Transform for TABLE CONSUMPTION
    consumption = .consumption~NEW()
    --COLUMN HWB_VALUE <INT> AND ENERGY_LABEL <NVARCHAR>
    energyBox = adpage~getElementById("energieausweis")
    IF energyBox \= .nil THEN
        DO
            energyText = energyBox~ownText
            --text is split at "kWh/m2/Jahr: ", " Klasse" and ": "
            PARSE VAR energyText . "kWh/m2/Jahr: " hwbPart " Klasse" . ": " classPart
            consumption~hwb_value = hwbPart~delWord(2)
            consumption~energy_label = SUBSTR(classPart,1,1)
        END
    ELSE
        DO
            consumption~hwb_value = ""
            consumption~energy_label = ""
        END
    --COLUMN ENERGY_TYPE <NVARCHAR>
    consumption~energy_type = fmh_ownText(adpage~select("tr:contains(Heizung) td")~last(), "")

    --Extract & Transform for TABLE APARTMENT
    apartment = .apartment~NEW()
    --COLUMN ROOM_COUNT <INT>
    apartment~room_count = fmh_ownText(adpage~select("tr:contains(Zimmer) td")~last(), "NULL")
    --COLUMN CONSTRUCTION_YEAR <NVARCHAR>
    apartment~construction_year = fmh_ownText(adpage~select("tr:contains(Baujahr) td")~last(), "")
    --COLUMN CONSTRUCTION_TYPE <NVARCHAR>
    apartment~construction_type = fmh_ownText(adpage~select("tr:contains(Alt- oder Neubau) td")~last(), "")
    --COLUMN CONDITION <NVARCHAR>
    apartment~condition = fmh_ownText(adpage~select("tr:contains(Zustand) td")~last(), "")

    --Extract & Transform for TABLE AD
    ad = .ad~NEW()
    --COLUMN <NVARCHAR> TITLE
    ad~title = ""
    titleBox = adpage~getElementsByClass("col-xs-12 col-sm-12 col-md-12 col-lg-12 margin-top-10")~first()
    IF titleBox \= .nil THEN
        ad~title = fmh_ownText(titleBox~getElementsByTag("h1")~first(), "")
    --COLUMN <NVARCHAR> DESCRIPTION
    ad~description = ""

    --create a JSON Object for the ad entry and add the data of every table
    jsonAdEntry = .bsf~new("org.json.JSONObject")
    jsonAdEntry~~put("adress", adress~exportJSON)
    jsonAdEntry~~put("data_source", data_source~exportJSON)
    jsonAdEntry~~put("chronology", chronology~exportJSON)
    jsonAdEntry~~put("characteristic", characteristic~exportJSON)
    jsonAdEntry~~put("key_figures", key_figures~exportJSON)
    jsonAdEntry~~put("apartment", apartment~exportJSON)
    jsonAdEntry~~put("consumption", consumption~exportJSON)
    jsonAdEntry~~put("contact_person", contact_person~exportJSON)
    jsonAdEntry~~put("ad", ad~exportJSON)

    SAY jsonAdEntry~toString -- print JSON object in console

    RETURN jsonAdEntry

--Helper: own text of a Jsoup element, or the fallback when the lookup found nothing (.nil)
::ROUTINE fmh_ownText
    USE ARG element, fallback
    IF element \= .nil THEN RETURN element~ownText()
    RETURN fallback

--Helper: "TRUE" when the looked-up element exists, otherwise "FALSE"
::ROUTINE fmh_exists
    USE ARG element
    IF element \= .nil THEN RETURN "TRUE"
    RETURN "FALSE"

--Helper: amount cell without its leading word as a plain decimal, or "NULL" when the cell is missing
::ROUTINE fmh_amount
    USE ARG element
    IF element \= .nil THEN RETURN fmh_decimal(element~ownText()~delWord(1,1))
    RETURN "NULL"

--Helper: turn German number notation ("1.234,56") into a plain decimal ("1234.56")
::ROUTINE fmh_decimal
    USE ARG text
    RETURN text~changeStr(".","")~changeStr(",",".")

----------------------------------------------------------------------------------------------------------------------

--Value holder for one row of TABLE DATA_SOURCE.
::CLASS data_source
    ::METHOD name ATTRIBUTE         -- name of the scraped source
    ::METHOD internal_id ATTRIBUTE  -- identifier of the ad taken from the page
    ::METHOD url ATTRIBUTE          -- URL of the ad detail page

    --Prints the attributes as one semicolon-separated line
    ::METHOD exportCSV
        SAY self~name || ";" || self~internal_id || ";" || self~url

    --Returns the attributes as an org.json.JSONObject, keyed by attribute name
    ::METHOD exportJSON
        jsonData = .bsf~new("org.json.JSONObject")
        jsonData~put("name", self~name)
        jsonData~put("internal_id", self~internal_id)
        jsonData~put("url", self~url)
        RETURN jsonData

----------------------------------------------------------------------------------------------------------------------

--Value holder for one row of TABLE CHRONOLOGY.
::CLASS chronology
    ::METHOD  creation_time ATTRIBUTE   -- creation/update timestamp of the ad
    ::METHOD  available_from ATTRIBUTE  -- date from which the apartment is available
    ::METHOD  rental_period ATTRIBUTE   -- rental period as free text

    --Prints the attributes as one semicolon-separated line
    ::METHOD exportCSV
        SAY self~creation_time || ";" || self~available_from || ";" || self~rental_period

    --Returns the attributes as an org.json.JSONObject, keyed by attribute name
    ::METHOD exportJSON
        jsonData = .bsf~new("org.json.JSONObject")
        jsonData~put("creation_time", self~creation_time)
        jsonData~put("available_from", self~available_from)
        jsonData~put("rental_period", self~rental_period)
        RETURN jsonData

----------------------------------------------------------------------------------------------------------------------

--Value holder for one row of TABLE CONTACT_PERSON.
::CLASS contact_person
    ::METHOD contact_info ATTRIBUTE  -- free-text contact information
    ::METHOD phone_number ATTRIBUTE  -- phone number as found on the page
    ::METHOD email ATTRIBUTE         -- e-mail address

    --Prints the attributes as one semicolon-separated line
    ::METHOD exportCSV
        SAY self~contact_info || ";" || self~phone_number || ";" || self~email

    --Returns the attributes as an org.json.JSONObject, keyed by attribute name
    ::METHOD exportJSON
        jsonData = .bsf~new("org.json.JSONObject")
        jsonData~put("contact_info", self~contact_info)
        jsonData~put("phone_number", self~phone_number)
        jsonData~put("email", self~email)
        RETURN jsonData

----------------------------------------------------------------------------------------------------------------------

--Value holder for one row of TABLE ADRESS.
::CLASS adress
    ::METHOD  country ATTRIBUTE   -- country code
    ::METHOD  zip_code ATTRIBUTE  -- postal code
    ::METHOD  street ATTRIBUTE    -- street name
    ::METHOD  floor ATTRIBUTE     -- floor of the apartment

    --Prints the attributes as one semicolon-separated line
    ::METHOD exportCSV
        SAY self~country || ";" || self~zip_code || ";" || self~street || ";" || self~floor

    --Returns the attributes as an org.json.JSONObject, keyed by attribute name
    ::METHOD exportJSON
        jsonData = .bsf~new("org.json.JSONObject")
        jsonData~put("country", self~country)
        jsonData~put("zip_code", self~zip_code)
        jsonData~put("street", self~street)
        jsonData~put("floor", self~floor)
        RETURN jsonData

----------------------------------------------------------------------------------------------------------------------

--Value holder for one row of TABLE CHARACTERISTIC.
--NOTE: the attribute and JSON key "elavator" (elevator) is misspelled; the
--spelling is part of the exported JSON format and is therefore kept as is.
::CLASS characteristic
    ::METHOD  balcony ATTRIBUTE       -- balcony flag
    ::METHOD  basement ATTRIBUTE      -- basement flag
    ::METHOD  terrace ATTRIBUTE       -- terrace flag
    ::METHOD  parking_spot ATTRIBUTE  -- parking spot flag
    ::METHOD  elavator ATTRIBUTE      -- elevator flag

    --Prints the attributes as one semicolon-separated line
    ::METHOD exportCSV
        SAY self~balcony || ";" || self~basement || ";" || self~terrace || ";" || self~parking_spot || ";" || self~elavator

    --Returns the attributes as an org.json.JSONObject, keyed by attribute name
    ::METHOD exportJSON
        jsonData = .bsf~new("org.json.JSONObject")
        jsonData~put("balcony", self~balcony)
        jsonData~put("basement", self~basement)
        jsonData~put("terrace", self~terrace)
        jsonData~put("parking_spot", self~parking_spot)
        jsonData~put("elavator", self~elavator)
        RETURN jsonData

----------------------------------------------------------------------------------------------------------------------

--Value holder for one row of TABLE KEY_FIGURES.
::CLASS key_figures
    ::METHOD  living_space ATTRIBUTE    -- living space
    ::METHOD  total_amount ATTRIBUTE    -- total monthly cost
    ::METHOD  operating_cost ATTRIBUTE  -- operating cost
    ::METHOD  heating_cost ATTRIBUTE    -- heating cost
    ::METHOD  deposit ATTRIBUTE         -- deposit
    ::METHOD  commission ATTRIBUTE      -- commission as free text

    --Prints the attributes as one semicolon-separated line
    ::METHOD exportCSV
        SAY self~living_space || ";" || self~total_amount || ";" || self~operating_cost || ";" || self~heating_cost || ";" || self~deposit || ";" || self~commission

    --Returns the attributes as an org.json.JSONObject, keyed by attribute name
    ::METHOD exportJSON
        jsonData = .bsf~new("org.json.JSONObject")
        jsonData~put("total_amount", self~total_amount)
        jsonData~put("living_space", self~living_space)
        jsonData~put("operating_cost", self~operating_cost)
        jsonData~put("heating_cost", self~heating_cost)
        jsonData~put("deposit", self~deposit)
        jsonData~put("commission", self~commission)
        RETURN jsonData

----------------------------------------------------------------------------------------------------------------------

--Value holder for one row of TABLE CONSUMPTION.
::CLASS consumption
    ::METHOD  hwb_value ATTRIBUTE     -- HWB value from the energy certificate
    ::METHOD  energy_label ATTRIBUTE  -- energy class letter
    ::METHOD  energy_type ATTRIBUTE   -- heating type as free text

    --Prints the attributes as one semicolon-separated line
    ::METHOD exportCSV
        SAY self~hwb_value || ";" || self~energy_label || ";" || self~energy_type

    --Returns the attributes as an org.json.JSONObject, keyed by attribute name
    ::METHOD exportJSON
        jsonData = .bsf~new("org.json.JSONObject")
        jsonData~put("hwb_value", self~hwb_value)
        jsonData~put("energy_label", self~energy_label)
        jsonData~put("energy_type", self~energy_type)
        RETURN jsonData

----------------------------------------------------------------------------------------------------------------------

--Value holder for one row of TABLE APARTMENT.
::CLASS apartment
    ::METHOD  room_count ATTRIBUTE         -- number of rooms
    ::METHOD  construction_year ATTRIBUTE  -- year of construction
    ::METHOD  construction_type ATTRIBUTE  -- old or new building
    ::METHOD  condition ATTRIBUTE          -- condition of the apartment

    --Prints the attributes as one semicolon-separated line
    ::METHOD exportCSV
        SAY self~room_count || ";" || self~construction_year || ";" || self~construction_type || ";" || self~condition

    --Returns the attributes as an org.json.JSONObject, keyed by attribute name
    ::METHOD exportJSON
        jsonData = .bsf~new("org.json.JSONObject")
        jsonData~put("room_count", self~room_count)
        jsonData~put("construction_year", self~construction_year)
        jsonData~put("construction_type", self~construction_type)
        jsonData~put("condition", self~condition)
        RETURN jsonData

----------------------------------------------------------------------------------------------------------------------

--Value holder for one row of TABLE AD.
::CLASS ad
    ::METHOD  title ATTRIBUTE        -- headline of the ad
    ::METHOD  description ATTRIBUTE  -- description text of the ad

    --Prints the attributes as one semicolon-separated line
    ::METHOD exportCSV
        SAY self~title || ";" || self~description

    --Returns the attributes as an org.json.JSONObject, keyed by attribute name
    ::METHOD exportJSON
        jsonData = .bsf~new("org.json.JSONObject")
        jsonData~put("title", self~title)
        jsonData~put("description", self~description)
        RETURN jsonData

----------------------------------------------------------------------------------------------------------------------
